Use global abbreviation data across languages and remove duplicates

2025-11-04 09:57:26 +03:00 · 2017-01-08 20:36:00 +01:00 · 2017-01-08 20:36:00 +01:00 · 0dec90e9f7
commit 0dec90e9f7
parent 7c3cb2a652
13 changed files with 35 additions and 124 deletions
--- a/spacy/de/language_data.py
+++ b/spacy/de/language_data.py
@ -9,12 +9,13 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY


-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)


+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))


--- a/spacy/de/tokenizer_exceptions.py
+++ b/spacy/de/tokenizer_exceptions.py
@ -516,11 +516,6 @@ TOKENIZER_EXCEPTIONS = {


 ORTH_ONLY = [
-    "'",
-    "\\\")",
-    "<space>",
-    "a.",
-    "ä.",
    "A.C.",
    "a.D.",
    "A.D.",
@ -530,24 +525,20 @@ ORTH_ONLY = [
    "Abs.",
    "adv.",
    "al.",
-    "b.",
    "B.A.",
    "B.Sc.",
    "betr.",
    "biol.",
    "Biol.",
-    "c.",
    "ca.",
    "Chr.",
    "Cie.",
    "co.",
    "Co.",
-    "d.",
    "D.C.",
    "Dipl.-Ing.",
    "Dipl.",
    "Dr.",
-    "e.",
    "e.g.",
    "e.V.",
    "ehem.",
@ -555,79 +546,57 @@ ORTH_ONLY = [
    "erm.",
    "etc.",
    "ev.",
-    "f.",
-    "g.",
    "G.m.b.H.",
    "geb.",
    "Gebr.",
    "gem.",
-    "h.",
    "h.c.",
    "Hg.",
    "hrsg.",
    "Hrsg.",
-    "i.",
    "i.A.",
    "i.e.",
    "i.G.",
    "i.Tr.",
    "i.V.",
    "Ing.",
-    "j.",
    "jr.",
    "Jr.",
    "jun.",
    "jur.",
-    "k.",
    "K.O.",
-    "l.",
    "L.A.",
    "lat.",
-    "m.",
    "M.A.",
    "m.E.",
    "m.M.",
    "M.Sc.",
    "Mr.",
-    "n.",
    "N.Y.",
    "N.Y.C.",
    "nat.",
    "ö."
-    "o.",
    "o.a.",
    "o.ä.",
    "o.g.",
    "o.k.",
    "O.K.",
-    "p.",
    "p.a.",
    "p.s.",
    "P.S.",
    "pers.",
    "phil.",
-    "q.",
    "q.e.d.",
-    "r.",
    "R.I.P.",
    "rer.",
-    "s.",
    "sen.",
    "St.",
    "std.",
-    "t.",
-    "u.",
-    "ü.",
    "u.a.",
    "U.S.",
    "U.S.A.",
    "U.S.S.",
-    "v.",
    "Vol.",
    "vs.",
-    "w.",
-    "wiss.",
-    "x.",
-    "y.",
-    "z."
+    "wiss."
 ]
--- a/spacy/en/language_data.py
+++ b/spacy/en/language_data.py
@ -37,14 +37,16 @@ def get_time_exc(hours):
    return exc


-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)


+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+

 __all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"]
--- a/spacy/en/tokenizer_exceptions.py
+++ b/spacy/en/tokenizer_exceptions.py
@ -718,39 +718,25 @@ for string in EXCLUDE_EXC:

 ORTH_ONLY = [
    "'d",
-    "''",
-    "a.",
    "a.m.",
    "Adm.",
-    "b.",
    "Bros.",
-    "c.",
    "co.",
    "Co.",
    "Corp.",
-    "d.",
    "D.C.",
    "Dr.",
-    "e.",
    "e.g.",
    "E.g.",
    "E.G.",
-    "f.",
-    "g.",
    "Gen.",
    "Gov.",
-    "h.",
-    "i.",
    "i.e.",
    "I.e.",
    "I.E.",
    "Inc.",
-    "j.",
    "Jr.",
-    "k.",
-    "l.",
    "Ltd.",
-    "m.",
    "Md.",
    "Messrs.",
    "Mo.",
@ -758,24 +744,11 @@ ORTH_ONLY = [
    "Mr.",
    "Mrs.",
    "Ms.",
-    "n.",
-    "o.",
-    "p.",
    "p.m.",
    "Ph.D.",
-    "q.",
-    "r.",
    "Rep.",
    "Rev.",
-    "s.",
    "Sen.",
    "St.",
-    "t.",
-    "u.",
-    "v.",
-    "vs.",
-    "w.",
-    "x.",
-    "y.",
-    "z."
+    "vs."
 ]
--- a/spacy/es/language_data.py
+++ b/spacy/es/language_data.py
@ -40,11 +40,14 @@ def get_time_exc(hours):
    return exc


-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 STOP_WORDS = set(STOP_WORDS)

+
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+

 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
--- a/spacy/es/tokenizer_exceptions.py
+++ b/spacy/es/tokenizer_exceptions.py
@ -85,55 +85,29 @@ TOKENIZER_EXCEPTIONS = {


 ORTH_ONLY = [
-    "a.",
    "a.C.",
    "a.J.C.",
    "apdo.",
    "Av.",
    "Avda.",
-    "b.",
-    "c.",
    "Cía.",
-    "d.",
-    "e.",
    "etc.",
-    "f.",
-    "g.",
    "Gob.",
    "Gral.",
-    "h.",
-    "i.",
    "Ing.",
-    "j.",
    "J.C.",
-    "k.",
-    "l.",
    "Lic.",
-    "m.",
    "m.n.",
-    "n.",
    "no.",
    "núm.",
-    "o.",
-    "p.",
    "P.D.",
    "Prof.",
    "Profa.",
-    "q.",
    "q.e.p.d."
-    "r.",
-    "s.",
    "S.A.",
    "S.L.",
    "s.s.s.",
    "Sr.",
    "Sra.",
-    "Srta.",
-    "t.",
-    "u.",
-    "v.",
-    "w.",
-    "x.",
-    "y.",
-    "z."
+    "Srta."
 ]
--- a/spacy/fr/language_data.py
+++ b/spacy/fr/language_data.py
@ -2,13 +2,16 @@
 from __future__ import unicode_literals

 from .. import language_data as base
-from ..language_data import strings_to_exc
+from ..language_data import strings_to_exc, update_exc

 from .stop_words import STOP_WORDS


-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)


+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
--- a/spacy/hu/language_data.py
+++ b/spacy/hu/language_data.py
@ -11,13 +11,14 @@ from .tokenizer_exceptions import OTHER_EXC
 from .. import language_data as base

 STOP_WORDS = set(STOP_WORDS)
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES
 TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
 TOKENIZER_INFIXES = TOKENIZER_INFIXES

 # HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]

+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))

--- a/spacy/hu/tokenizer_exceptions.py
+++ b/spacy/hu/tokenizer_exceptions.py
@ -111,7 +111,6 @@ Vcs.
 Vhr.
 X.Y.
 Zs.
-a.
 a.C.
 ac.
 adj.
@ -126,7 +125,6 @@ ang.
 arch.
 at.
 aug.
-b.
 b.a.
 b.s.
 b.sc.
@ -141,7 +139,6 @@ br.
 bsc.
 bt.
 btk.
-c.
 ca.
 cc.
 cca.
@ -155,7 +152,6 @@ csc.
 csüt.
 cső.
 ctv.
-d.
 dbj.
 dd.
 ddr.
@ -170,7 +166,6 @@ dolg.
 dr.
 du.
 dzs.
-e.
 ea.
 ed.
 eff.
@ -186,7 +181,6 @@ etc.
 ev.
 ezr.
 eü.
-f.
 f.h.
 f.é.
 fam.
@ -213,7 +207,6 @@ főig.
 főisk.
 főtörm.
 főv.
-g.
 gazd.
 gimn.
 gk.
@ -225,7 +218,6 @@ gy.
 gyak.
 gyártm.
 gör.
-h.
 hads.
 hallg.
 hdm.
@ -266,7 +258,6 @@ isk.
 ism.
 izr.
 iá.
-j.
 jan.
 jav.
 jegyz.
@ -278,7 +269,6 @@ jr.
 jvb.
 júl.
 jún.
-k.
 karb.
 kat.
 kb.
@ -313,7 +303,6 @@ közl.
 közp.
 közt.
 kü.
-l.
 lat.
 ld.
 legs.
@ -324,7 +313,6 @@ lt.
 ltd.
 ltp.
 luth.
-m.
 m.a.
 m.s.
 m.sc.
@ -359,7 +347,6 @@ műh.
 műsz.
 műv.
 művez.
-n.
 nagyker.
 nagys.
 nat.
@ -372,7 +359,6 @@ ny.
 nyilv.
 nyrt.
 nyug.
-o.
 obj.
 okl.
 okt.
@ -381,7 +367,6 @@ orsz.
 ort.
 ov.
 ovh.
-p.
 pf.
 pg.
 ph.d
@ -404,8 +389,6 @@ pság.
 ptk.
 pu.
 pü.
-q.
-r.
 r.k.
 rac.
 rad.
@ -420,7 +403,6 @@ rkt.
 rt.
 rtg.
 röv.
-s.
 s.b.
 s.k.
 sa.
@ -450,7 +432,6 @@ szt.
 szubj.
 szöv.
 szül.
-t.
 tanm.
 tb.
 tbk.
@ -476,13 +457,11 @@ tvr.
 ty.
 törv.
 tü.
-u.
 ua.
 ui.
 unit.
 uo.
 uv.
-v.
 vas.
 vb.
 vegy.
@ -501,9 +480,6 @@ vv.
 vál.
 vízv.
 vö.
-w.
-y.
-z.
 zrt.
 zs.
 Ész.
@ -520,7 +496,6 @@ zs.
 évf.
 í.
 ó.
-ö.
 össz.
 ötk.
 özv.
@ -528,7 +503,6 @@ zs.
 úm.
 ún.
 út.
-ü.
 üag.
 üd.
 üdv.
@ -544,6 +518,5 @@ zs.
 """.strip().split()

 OTHER_EXC = """
-''
 -e
 """.strip().split()
--- a/spacy/it/language_data.py
+++ b/spacy/it/language_data.py
@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS


-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)


+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
--- a/spacy/nl/language_data.py
+++ b/spacy/nl/language_data.py
@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS


-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)


+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
--- a/spacy/pt/language_data.py
+++ b/spacy/pt/language_data.py
@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS


-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)


+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
--- a/spacy/sv/language_data.py
+++ b/spacy/sv/language_data.py
@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc
 from .stop_words import STOP_WORDS


-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)


+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]