From 1be5da1ac6271bc41eb18e996d1ad53cf154e592 Mon Sep 17 00:00:00 2001
From: Gyorgy Orosz
Date: Sat, 14 Jan 2017 15:51:59 +0100
Subject: [PATCH] Fixed Hungarian tokenizer for numbers

---
 spacy/hu/__init__.py                        |  3 ++
 spacy/hu/language_data.py                   |  8 +---
 spacy/hu/punctuation.py                     | 44 +++++++++++++--------
 spacy/hu/tokenizer_exceptions.py            | 31 ++++++++++++++-
 spacy/language_data/punctuation.py          |  6 +--
 spacy/language_data/tokenizer_exceptions.py |  6 +--
 spacy/tests/hu/test_tokenizer.py            | 42 +++++++++++++++-----
 7 files changed, 99 insertions(+), 41 deletions(-)

diff --git a/spacy/hu/__init__.py b/spacy/hu/__init__.py
index 2343b4606..652ea379c 100644
--- a/spacy/hu/__init__.py
+++ b/spacy/hu/__init__.py
@@ -1,6 +1,7 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
+from spacy.hu.tokenizer_exceptions import TOKEN_MATCH
 from .language_data import *
 from ..attrs import LANG
 from ..language import Language
@@ -21,3 +22,5 @@ class Hungarian(Language):
         infixes = tuple(TOKENIZER_INFIXES)
 
         stop_words = set(STOP_WORDS)
+
+        token_match = TOKEN_MATCH
diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py
index 49652c5ac..b4cadda16 100644
--- a/spacy/hu/language_data.py
+++ b/spacy/hu/language_data.py
@@ -1,8 +1,6 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
-import six
-
 from spacy.language_data import strings_to_exc, update_exc
 from .punctuation import *
 from .stop_words import STOP_WORDS
@@ -10,19 +8,15 @@ from .tokenizer_exceptions import ABBREVIATIONS
 from .tokenizer_exceptions import OTHER_EXC
 from .. import language_data as base
 
-
 STOP_WORDS = set(STOP_WORDS)
 
-
 TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
 
-
 TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES
-TOKENIZER_SUFFIXES = base.TOKENIZER_SUFFIXES + TOKENIZER_SUFFIXES
+TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
 TOKENIZER_INFIXES = TOKENIZER_INFIXES
 
-
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
diff --git a/spacy/hu/punctuation.py b/spacy/hu/punctuation.py
index e28052fd3..13d348666 100644
--- a/spacy/hu/punctuation.py
+++ b/spacy/hu/punctuation.py
@@ -1,25 +1,35 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
-from ..language_data.punctuation import ALPHA, ALPHA_LOWER, ALPHA_UPPER, LIST_ELLIPSES
+from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPHA_UPPER, LIST_QUOTES, UNITS, \
+    CURRENCY, LIST_PUNCT, ALPHA
 
+CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿"
 
-TOKENIZER_SUFFIXES = [
-    r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)
-]
-
-TOKENIZER_INFIXES = [
-    r'(?<=[0-9])-(?=[0-9])',
-    r'(?<=[0-9])[+\-\*/^](?=[0-9])',
-    r'(?<=[{a}])--(?=[{a}])',
-    r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
-    r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
-    r'(?<=[0-9{a}])"(?=[\-{a}])'.format(a=ALPHA),
-    r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA)
-]
-
-
-TOKENIZER_INFIXES += LIST_ELLIPSES
+TOKENIZER_SUFFIXES = (
+    LIST_PUNCT +
+    LIST_ELLIPSES +
+    LIST_QUOTES +
+    [
+        r'(?<=[0-9])\+',
+        r'(?<=°[FfCcKk])\.',
+        r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
+        r'(?<=[0-9])(?:{u})'.format(u=UNITS),
+        r'(?<=[{al}{p}{c}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES, c=CURRENCY_SYMBOLS),
+        r'(?<=[{al})])-e'.format(al=ALPHA_LOWER)
+    ]
+)
 
+TOKENIZER_INFIXES = (
+    LIST_ELLIPSES +
+    [
+        r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+        r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
+        r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
+        r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+        r'(?<=[0-9{a}])({q})(?=[\-{a}])'.format(a=ALPHA, q=QUOTES),
+    ]
+)
 
 __all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
diff --git a/spacy/hu/tokenizer_exceptions.py b/spacy/hu/tokenizer_exceptions.py
index 46122564c..24ef669e2 100644
--- a/spacy/hu/tokenizer_exceptions.py
+++ b/spacy/hu/tokenizer_exceptions.py
@@ -1,6 +1,11 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
+import re
+
+from spacy.language_data.punctuation import ALPHA_LOWER, CURRENCY
+from ..language_data.tokenizer_exceptions import URL_PATTERN
+
 ABBREVIATIONS = """
 AkH.
 Aö.
@@ -107,6 +112,7 @@ Tel.
 Ty.
 Tyr.
 Ui.
+Ut.
 Vcs.
 Vhr.
 X.Y.
@@ -212,6 +218,7 @@ gimn.
 gk.
 gkv.
 gondn.
+Gr.
 gr.
 grav.
 gy.
@@ -237,6 +244,7 @@ ht.
 htb.
 hv.
 hőm.
+ie.
 i.e.
 i.sz.
 id.
@@ -271,6 +279,7 @@ júl.
 jún.
 karb.
 kat.
+kath.
 kb.
 kcs.
 kd.
@@ -286,6 +295,7 @@ kk.
 kkt.
 klin.
 kp.
+Kr.
 krt.
 kt.
 ktsg.
@@ -357,6 +367,7 @@ nov.
 nu.
 ny.
 nyilv.
+Nyrt.
 nyrt.
 nyug.
 obj.
@@ -409,6 +420,7 @@ sa.
 sel.
 sgt.
 sm.
+St.
 st.
 stat.
 stb.
@@ -478,8 +490,11 @@ vs.
 vsz.
 vv.
 vál.
+várm.
+Várm.
 vízv.
 vö.
+Zrt.
 zrt.
 zs.
 Ész.
@@ -502,6 +517,7 @@ zs.
 ú.
 úm.
 ún.
+ú.n.
 út.
 üag.
 üd.
@@ -510,7 +526,6 @@ zs.
 ümk.
 ütk.
 üv.
-ő.
 ű.
 őrgy.
 őrpk.
@@ -520,3 +535,17 @@ zs.
 OTHER_EXC = """
 -e
 """.strip().split()
+
+ORD_NUM_OR_DATE = "([A-Z0-9]+[./-])*(\d+\.?)"
+_NUM = "[+\-]?\d+([,.]\d+)*"
+_OPS = "[=<>+\-\*/^()÷%²]"
+_SUFFIES = "-[{a}]+".format(a=ALPHA_LOWER)
+NUMERIC_EXP = "\(?({n})(({o})({n}))*[)%]?".format(n=_NUM, o=_OPS)
+TIME_EXP = "\d+(:\d+)*(\.\d+)?"
+
+NUMS = "(({ne})|({t})|({on})|({c}))({s})?".format(
+    ne=NUMERIC_EXP, t=TIME_EXP, on=ORD_NUM_OR_DATE,
+    c=CURRENCY, s=_SUFFIES
+)
+
+TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=URL_PATTERN, n=NUMS)).match
diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py
index d8ed19ca1..a7244ab19 100644
--- a/spacy/language_data/punctuation.py
+++ b/spacy/language_data/punctuation.py
@@ -57,14 +57,14 @@ LIST_PUNCT = list(_PUNCT.strip().split())
 LIST_HYPHENS = list(_HYPHENS.strip().split())
 
 
-ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '')
-ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '')
+ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '').replace('\n', '')
+ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '').replace(' ', '')
 ALPHA = ALPHA_LOWER + ALPHA_UPPER
 
 
 QUOTES = _QUOTES.strip().replace(' ', '|')
 CURRENCY = _CURRENCY.strip().replace(' ', '|')
-UNITS = _UNITS.strip().replace(' ', '|')
+UNITS = _UNITS.strip().replace(' ', '|').replace('\n', '|')
 HYPHENS = _HYPHENS.strip().replace(' ', '|')
 
 
diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py
index 6551440f2..33a2417d1 100644
--- a/spacy/language_data/tokenizer_exceptions.py
+++ b/spacy/language_data/tokenizer_exceptions.py
@@ -2,10 +2,10 @@ from __future__ import unicode_literals
 
 import re
 
-_URL_PATTERN = r'''
-^((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)$
+URL_PATTERN = r'''
+((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w\-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)
'''.strip() -TOKEN_MATCH = re.compile(_URL_PATTERN).match +TOKEN_MATCH = re.compile("^{}$".format(URL_PATTERN)).match __all__ = ['TOKEN_MATCH'] diff --git a/spacy/tests/hu/test_tokenizer.py b/spacy/tests/hu/test_tokenizer.py index 0b76da0c6..c08511c1b 100644 --- a/spacy/tests/hu/test_tokenizer.py +++ b/spacy/tests/hu/test_tokenizer.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import pytest - DEFAULT_TESTS = [ ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), @@ -24,11 +23,13 @@ DEFAULT_TESTS = [ HYPHEN_TESTS = [ ('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']), + ('Szabolcs-Szatmár-Bereg megye', ['Szabolcs-Szatmár-Bereg', 'megye']), ('Egy -nak.', ['Egy', '-nak', '.']), ('Egy bel-.', ['Egy', 'bel-', '.']), ('Dinnye-domb-.', ['Dinnye-domb-', '.']), ('Ezen -e elcsatangolt.', ['Ezen', '-e', 'elcsatangolt', '.']), ('Lakik-e', ['Lakik', '-e']), + ('A--B', ['A', '--' 'B']), ('Lakik-e?', ['Lakik', '-e', '?']), ('Lakik-e.', ['Lakik', '-e', '.']), ('Lakik-e...', ['Lakik', '-e', '...']), @@ -89,11 +90,15 @@ NUMBER_TESTS = [ ('A -23,12 van.', ['A', '-23,12', 'van', '.']), ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']), ('A -23,12-ben.', ['A', '-23,12-ben', '.']), - ('A 2+3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2 +3 van.', ['A', '2', '+', '3', 'van', '.']), + ('A 2+3 van.', ['A', '2+3', 'van', '.']), + ('A 2<3 van.', ['A', '2<3', 'van', '.']), + ('A 2=3 van.', ['A', '2=3', 'van', '.']), + ('A 2÷3 van.', ['A', '2÷3', 'van', '.']), + ('A (2÷3)-2/5=1 van.', ['A', '(2÷3)-2/5=1', 'van', '.']), + ('A 2 +3 van.', ['A', '2', '+3', 'van', '.']), ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']), ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2*3 van.', ['A', '2', '*', '3', 'van', '.']), + ('A 2*3 van.', ['A', '2*3', 'van', '.']), ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']), ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']), ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']), @@ -142,6 +147,7 @@ NUMBER_TESTS = [ ('A 2002--2003. van.', ['A', '2002--2003.', 'van', '.']), ('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']), ('A 2002--2003-ben.', ['A', '2002--2003-ben', '.']), + ('A +0,99% van.', ['A', '+0,99%', 'van', '.']), ('A -0,99% van.', ['A', '-0,99%', 'van', '.']), ('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']), ('A -0,99%.', ['A', '-0,99%', '.']), @@ -194,7 +200,16 @@ NUMBER_TESTS = [ ('A III/c-ben.', ['A', 'III/c-ben', '.']), ('A TU–154 van.', ['A', 'TU–154', 'van', '.']), ('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']), - ('A TU–154-ben.', ['A', 'TU–154-ben', '.']) + ('A TU–154-ben.', ['A', 'TU–154-ben', '.']), + ('A 5cm³', ['A', '5', 'cm³']), + ('A 5 $-ban', ['A', '5', '$-ban']), + ('A 5$-ban', ['A', '5$-ban']), + ('A 5$.', ['A', '5', '$', '.']), + ('A 5$', ['A', '5', '$']), + ('A $5', ['A', '$', '5']), + ('A 5km/h', ['A', '5', 'km/h']), + ('A 75%+1-100%-ig', ['A', '75%+1-100%-ig']), + ('A 5km/h.', ['A', '5', 'km/h', '.']), ] QUOTE_TESTS = [ @@ -202,15 +217,15 @@ QUOTE_TESTS = [ ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']), ('Egy 24"-os monitor.', ['Egy', '24', '"', '-os', 'monitor', '.']), - ("A don't van.", ['A', "don't", 'van', '.']) + # ("A don't van.", ['A', "don't", 'van', '.']) ] DOT_TESTS = [ ('N. 
kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), ('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.']), ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), - ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), - ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), + ('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']), + ('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']), ('A .hu.', ['A', '.hu', '.']), ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), ('A pl.', ['A', 'pl.']), @@ -223,9 +238,16 @@ DOT_TESTS = [ ('Valami ... más.', ['Valami', '...', 'más', '.']) ] +WIKI_TESTS = [ + ('!"', ['!', '"']), + ('!"-lel', ['!', '"', '-lel']), + ('""-sorozat ', ['"', '"', '-sorozat']), + ('"(Köszönöm', ['"', '(', 'Köszönöm']), + ('(törvénykönyv)-ben ', ['(', 'törvénykönyv', ')', '-ben']), + ('"(...)"–sokkal ', ['"', '(', '...', ')', '"', '–sokkal']), +] -TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS # + NUMBER_TESTS + HYPHEN_TESTS - +TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS # + HYPHEN_TESTS # + WIKI_TESTS @pytest.mark.parametrize('text,expected_tokens', TESTCASES) def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):