From 0dec90e9f77cfe124d0fde8e994f61de4222064d Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 8 Jan 2017 20:36:00 +0100
Subject: [PATCH] Use global abbreviation data languages and remove duplicates

---
 spacy/de/language_data.py        |  3 ++-
 spacy/de/tokenizer_exceptions.py | 33 +-------------------------------
 spacy/en/language_data.py        |  4 +++-
 spacy/en/tokenizer_exceptions.py | 29 +---------------------------
 spacy/es/language_data.py        |  5 ++++-
 spacy/es/tokenizer_exceptions.py | 28 +--------------------------
 spacy/fr/language_data.py        |  7 +++++--
 spacy/hu/language_data.py        |  3 ++-
 spacy/hu/tokenizer_exceptions.py | 27 --------------------------
 spacy/it/language_data.py        |  5 ++++-
 spacy/nl/language_data.py        |  5 ++++-
 spacy/pt/language_data.py        |  5 ++++-
 spacy/sv/language_data.py        |  5 ++++-
 13 files changed, 35 insertions(+), 124 deletions(-)

diff --git a/spacy/de/language_data.py b/spacy/de/language_data.py
index f64c915f6..5e09c0eb3 100644
--- a/spacy/de/language_data.py
+++ b/spacy/de/language_data.py
@@ -9,12 +9,13 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
 
 
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)
 
 
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))

diff --git a/spacy/de/tokenizer_exceptions.py b/spacy/de/tokenizer_exceptions.py
index b0561a223..0d8dc54e8 100644
--- a/spacy/de/tokenizer_exceptions.py
+++ b/spacy/de/tokenizer_exceptions.py
@@ -516,11 +516,6 @@ TOKENIZER_EXCEPTIONS = {
 
 ORTH_ONLY = [
-    "'",
-    "\\\")",
-    "",
-    "a.",
-    "ä.",
     "A.C.",
     "a.D.",
     "A.D.",
@@ -530,24 +525,20 @@ ORTH_ONLY = [
     "a.m.",
     "A.M.",
     "Abb.",
     "Abk.",
     "Abs.",
     "adv.",
     "al.",
-    "b.",
     "B.A.",
     "B.Sc.",
     "betr.",
     "biol.",
     "Biol.",
-    "c.",
     "ca.",
     "Chr.",
     "Cie.",
     "co.",
     "Co.",
-    "d.",
     "D.C.",
     "Dipl.-Ing.",
     "Dipl.",
     "Dr.",
-    "e.",
     "e.g.",
     "e.V.",
     "ehem.",
@@ -555,79 +546,57 @@ ORTH_ONLY = [
     "entspr.",
     "erm.",
     "etc.",
     "ev.",
-    "f.",
-    "g.",
     "G.m.b.H.",
     "geb.",
     "Gebr.",
     "gem.",
-    "h.",
     "h.c.",
     "Hg.",
     "hrsg.",
     "Hrsg.",
-    "i.",
     "i.A.",
     "i.e.",
     "i.G.",
     "i.Tr.",
     "i.V.",
     "Ing.",
-    "j.",
     "jr.",
     "Jr.",
     "jun.",
     "jur.",
-    "k.",
     "K.O.",
-    "l.",
     "L.A.",
     "lat.",
-    "m.",
     "M.A.",
     "m.E.",
     "m.M.",
     "M.Sc.",
     "Mr.",
-    "n.",
     "N.Y.",
     "N.Y.C.",
     "nat.",
     "ö."
-    "o.",
     "o.a.",
     "o.ä.",
     "o.g.",
     "o.k.",
     "O.K.",
-    "p.",
     "p.a.",
     "p.s.",
     "P.S.",
     "pers.",
     "phil.",
-    "q.",
     "q.e.d.",
-    "r.",
     "R.I.P.",
     "rer.",
-    "s.",
     "sen.",
     "St.",
     "std.",
-    "t.",
-    "u.",
-    "ü.",
     "u.a.",
     "U.S.",
     "U.S.A.",
     "U.S.S.",
-    "v.",
     "Vol.",
     "vs.",
-    "w.",
-    "wiss.",
-    "x.",
-    "y.",
-    "z."
+    "wiss."
 ]

diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py
index a75f2b9d5..1fcbf277e 100644
--- a/spacy/en/language_data.py
+++ b/spacy/en/language_data.py
@@ -37,14 +37,16 @@ def get_time_exc(hours):
     return exc
 
 
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 TAG_MAP = dict(TAG_MAP)
 STOP_WORDS = set(STOP_WORDS)
 
 
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’"))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "TAG_MAP", "STOP_WORDS", "LEMMA_RULES", "MORPH_RULES"]

diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py
index 44ad605a4..38fc33cfb 100644
--- a/spacy/en/tokenizer_exceptions.py
+++ b/spacy/en/tokenizer_exceptions.py
@@ -718,39 +718,25 @@ for string in EXCLUDE_EXC:
 
 ORTH_ONLY = [
     "'d",
-    "''",
-    "a.",
     "a.m.",
     "Adm.",
-    "b.",
     "Bros.",
-    "c.",
     "co.",
     "Co.",
     "Corp.",
-    "d.",
     "D.C.",
     "Dr.",
-    "e.",
     "e.g.",
     "E.g.",
     "E.G.",
-    "f.",
-    "g.",
     "Gen.",
     "Gov.",
-    "h.",
-    "i.",
     "i.e.",
     "I.e.",
     "I.E.",
     "Inc.",
-    "j.",
     "Jr.",
-    "k.",
-    "l.",
     "Ltd.",
-    "m.",
     "Md.",
     "Messrs.",
     "Mo.",
@@ -758,24 +744,11 @@ ORTH_ONLY = [
     "Mr.",
     "Mrs.",
     "Ms.",
-    "n.",
-    "o.",
-    "p.",
     "p.m.",
     "Ph.D.",
-    "q.",
-    "r.",
     "Rep.",
     "Rev.",
-    "s.",
     "Sen.",
     "St.",
-    "t.",
-    "u.",
-    "v.",
-    "vs.",
-    "w.",
-    "x.",
-    "y.",
-    "z."
+    "vs."
 ]

diff --git a/spacy/es/language_data.py b/spacy/es/language_data.py
index 3357c9ac8..7c44752cb 100644
--- a/spacy/es/language_data.py
+++ b/spacy/es/language_data.py
@@ -40,11 +40,14 @@ def get_time_exc(hours):
     return exc
 
 
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 STOP_WORDS = set(STOP_WORDS)
 
+
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 update_exc(TOKENIZER_EXCEPTIONS, get_time_exc(range(1, 12 + 1)))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

diff --git a/spacy/es/tokenizer_exceptions.py b/spacy/es/tokenizer_exceptions.py
index f9259ce93..93bc74642 100644
--- a/spacy/es/tokenizer_exceptions.py
+++ b/spacy/es/tokenizer_exceptions.py
@@ -85,55 +85,29 @@ TOKENIZER_EXCEPTIONS = {
 
 ORTH_ONLY = [
-    "a.",
     "a.C.",
     "a.J.C.",
     "apdo.",
     "Av.",
     "Avda.",
-    "b.",
-    "c.",
     "Cía.",
-    "d.",
-    "e.",
     "etc.",
-    "f.",
-    "g.",
     "Gob.",
     "Gral.",
-    "h.",
-    "i.",
     "Ing.",
-    "j.",
     "J.C.",
-    "k.",
-    "l.",
     "Lic.",
-    "m.",
     "m.n.",
-    "n.",
     "no.",
     "núm.",
-    "o.",
-    "p.",
     "P.D.",
     "Prof.",
     "Profa.",
-    "q.",
     "q.e.p.d."
-    "r.",
-    "s.",
     "S.A.",
     "S.L.",
     "s.s.s.",
     "Sr.",
     "Sra.",
-    "Srta.",
-    "t.",
-    "u.",
-    "v.",
-    "w.",
-    "x.",
-    "y.",
-    "z."
+    "Srta."
 ]

diff --git a/spacy/fr/language_data.py b/spacy/fr/language_data.py
index e612fe064..bbbeb1535 100644
--- a/spacy/fr/language_data.py
+++ b/spacy/fr/language_data.py
@@ -2,13 +2,16 @@
 from __future__ import unicode_literals
 
 from .. import language_data as base
-from ..language_data import strings_to_exc
+from ..language_data import strings_to_exc, update_exc
 
 from .stop_words import STOP_WORDS
 
 
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 STOP_WORDS = set(STOP_WORDS)
 
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+
+
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]

diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py
index 94eeb6f4d..0cb4ffd38 100644
--- a/spacy/hu/language_data.py
+++ b/spacy/hu/language_data.py
@@ -11,13 +11,14 @@ from .tokenizer_exceptions import OTHER_EXC
 from .. import language_data as base
 
 STOP_WORDS = set(STOP_WORDS)
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES + TOKENIZER_PREFIXES
 TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
 TOKENIZER_INFIXES = TOKENIZER_INFIXES
 
 # HYPHENS = [six.unichr(cp) for cp in [173, 8211, 8212, 8213, 8722, 9472]]
 
+TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))

diff --git a/spacy/hu/tokenizer_exceptions.py b/spacy/hu/tokenizer_exceptions.py
index 627035bb8..46122564c 100644
--- a/spacy/hu/tokenizer_exceptions.py
+++ b/spacy/hu/tokenizer_exceptions.py
@@ -111,7 +111,6 @@ Vcs.
 Vhr.
 X.Y.
 Zs.
-a.
 a.C.
 ac.
 adj.
@@ -126,7 +125,6 @@ ang.
 arch.
 at.
 aug.
-b.
 b.a.
 b.s.
 b.sc.
@@ -141,7 +139,6 @@ br.
 bsc.
 bt.
 btk.
-c.
 ca.
 cc.
 cca.
@@ -155,7 +152,6 @@ csc.
 csüt.
 cső.
 ctv.
-d.
 dbj.
 dd.
 ddr.
@@ -170,7 +166,6 @@ dolg.
 dr.
 du.
 dzs.
-e.
 ea.
 ed.
 eff.
@@ -186,7 +181,6 @@ etc.
 ev.
 ezr.
 eü.
-f.
 f.h.
 f.é.
 fam.
@@ -213,7 +207,6 @@ főig.
 főisk.
 főtörm.
 főv.
-g.
 gazd.
 gimn.
 gk.
@@ -225,7 +218,6 @@ gy.
 gyak.
 gyártm.
 gör.
-h.
 hads.
 hallg.
 hdm.
@@ -266,7 +258,6 @@ isk.
 ism.
 izr.
 iá.
-j.
 jan.
 jav.
 jegyz.
@@ -278,7 +269,6 @@ jr.
 jvb.
 júl.
 jún.
-k.
 karb.
 kat.
 kb.
@@ -313,7 +303,6 @@ közl.
 közp.
 közt.
 kü.
-l.
 lat.
 ld.
 legs.
@@ -324,7 +313,6 @@ lt.
 ltd.
 ltp.
 luth.
-m.
 m.a.
 m.s.
 m.sc.
@@ -359,7 +347,6 @@ műh.
 műsz.
 műv.
 művez.
-n.
 nagyker.
 nagys.
 nat.
@@ -372,7 +359,6 @@ ny.
 nyilv.
 nyrt.
 nyug.
-o.
 obj.
 okl.
 okt.
@@ -381,7 +367,6 @@ orsz.
 ort.
 ov.
 ovh.
-p.
 pf.
 pg.
 ph.d
@@ -404,8 +389,6 @@ pság.
 ptk.
 pu.
 pü.
-q.
-r.
 r.k.
 rac.
 rad.
@@ -420,7 +403,6 @@ rkt.
 rt.
 rtg.
 röv.
-s.
 s.b.
 s.k.
 sa.
@@ -450,7 +432,6 @@ szt.
 szubj.
 szöv.
 szül.
-t.
 tanm.
 tb.
 tbk.
@@ -476,13 +457,11 @@ tvr.
 ty.
 törv.
 tü.
-u.
 ua.
 ui.
 unit.
 uo.
 uv.
-v.
 vas.
 vb.
 vegy.
@@ -501,9 +480,6 @@ vv.
 vál.
 vízv.
 vö.
-w.
-y.
-z.
 zrt.
 zs.
 Ész.
@@ -520,7 +496,6 @@ zs.
 évf.
 í.
 ó.
-ö.
 össz.
 ötk.
 özv.
@@ -528,7 +503,6 @@ zs.
 úm.
 ún.
 út.
-ü.
 üag.
 üd.
 üdv.
@@ -544,6 +518,5 @@ zs.
""".strip().split() OTHER_EXC = """ -'' -e """.strip().split() diff --git a/spacy/it/language_data.py b/spacy/it/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/it/language_data.py +++ b/spacy/it/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/nl/language_data.py b/spacy/nl/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/nl/language_data.py +++ b/spacy/nl/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/pt/language_data.py +++ b/spacy/pt/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] diff --git a/spacy/sv/language_data.py b/spacy/sv/language_data.py index 8683f83ac..a4a657c33 100644 --- a/spacy/sv/language_data.py +++ b/spacy/sv/language_data.py @@ -7,8 +7,11 @@ from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) STOP_WORDS = set(STOP_WORDS) +TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) + + __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]