From 63037e79af54aefa734fd92258c0e6530c7977aa Mon Sep 17 00:00:00 2001
From: Gyorgy Orosz
Date: Sat, 14 Jan 2017 16:30:11 +0100
Subject: [PATCH] Fixed hyphen handling in the Hungarian tokenizer.

---
 spacy/hu/language_data.py          | 2 +-
 spacy/hu/punctuation.py            | 9 ++++++++-
 spacy/hu/tokenizer_exceptions.py   | 4 ++--
 spacy/language_data/punctuation.py | 2 +-
 spacy/tests/hu/test_tokenizer.py   | 8 +++++---
 5 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/spacy/hu/language_data.py b/spacy/hu/language_data.py
index b4cadda16..57b473527 100644
--- a/spacy/hu/language_data.py
+++ b/spacy/hu/language_data.py
@@ -15,7 +15,7 @@ update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(OTHER_EXC))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ABBREVIATIONS))
 
-TOKENIZER_PREFIXES = base.TOKENIZER_PREFIXES
+TOKENIZER_PREFIXES = TOKENIZER_PREFIXES
 TOKENIZER_SUFFIXES = TOKENIZER_SUFFIXES
 TOKENIZER_INFIXES = TOKENIZER_INFIXES
 
diff --git a/spacy/hu/punctuation.py b/spacy/hu/punctuation.py
index 13d348666..af0c2b559 100644
--- a/spacy/hu/punctuation.py
+++ b/spacy/hu/punctuation.py
@@ -6,6 +6,13 @@ from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPH
 
 CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿"
 
+TOKENIZER_PREFIXES = (
+    [r'\+'] +
+    LIST_PUNCT +
+    LIST_ELLIPSES +
+    LIST_QUOTES
+)
+
 TOKENIZER_SUFFIXES = (
     LIST_PUNCT +
     LIST_ELLIPSES +
@@ -32,4 +39,4 @@ TOKENIZER_INFIXES = (
     ]
 )
 
-__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
+__all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
diff --git a/spacy/hu/tokenizer_exceptions.py b/spacy/hu/tokenizer_exceptions.py
index f328e1d26..9153d43f0 100644
--- a/spacy/hu/tokenizer_exceptions.py
+++ b/spacy/hu/tokenizer_exceptions.py
@@ -539,13 +539,13 @@ OTHER_EXC = """
 ORD_NUM_OR_DATE = "([A-Z0-9]+[./-])*(\d+\.?)"
 _NUM = "[+\-]?\d+([,.]\d+)*"
 _OPS = "[=<>+\-\*/^()÷%²]"
-_SUFFIES = "-[{a}]+".format(a=ALPHA_LOWER)
+_SUFFIXES = "-[{a}]+".format(a=ALPHA_LOWER)
 NUMERIC_EXP = "({n})(({o})({n}))*[%]?".format(n=_NUM, o=_OPS)
 TIME_EXP = "\d+(:\d+)*(\.\d+)?"
 
 NUMS = "(({ne})|({t})|({on})|({c}))({s})?".format(
     ne=NUMERIC_EXP, t=TIME_EXP, on=ORD_NUM_OR_DATE,
-    c=CURRENCY, s=_SUFFIES
+    c=CURRENCY, s=_SUFFIXES
 )
 
 TOKEN_MATCH = re.compile("^({u})|({n})$".format(u=_URL_PATTERN, n=NUMS)).match
diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py
index a7244ab19..cb925c26d 100644
--- a/spacy/language_data/punctuation.py
+++ b/spacy/language_data/punctuation.py
@@ -58,7 +58,7 @@ LIST_HYPHENS = list(_HYPHENS.strip().split())
 
 
 ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '').replace('\n', '')
-ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '').replace(' ', '')
+ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '').replace('\n', '')
 ALPHA = ALPHA_LOWER + ALPHA_UPPER
 
 
diff --git a/spacy/tests/hu/test_tokenizer.py b/spacy/tests/hu/test_tokenizer.py
index e091f1914..4536d6658 100644
--- a/spacy/tests/hu/test_tokenizer.py
+++ b/spacy/tests/hu/test_tokenizer.py
@@ -29,7 +29,7 @@ HYPHEN_TESTS = [
     ('Dinnye-domb-.', ['Dinnye-domb-', '.']),
     ('Ezen -e elcsatangolt.', ['Ezen', '-e', 'elcsatangolt', '.']),
     ('Lakik-e', ['Lakik', '-e']),
-    ('A--B', ['A', '--' 'B']),
+    ('A--B', ['A', '--', 'B']),
     ('Lakik-e?', ['Lakik', '-e', '?']),
     ('Lakik-e.', ['Lakik', '-e', '.']),
     ('Lakik-e...', ['Lakik', '-e', '...']),
@@ -42,6 +42,7 @@ HYPHEN_TESTS = [
     ('A 7-es.', ['A', '7-es', '.']),
     ('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']),
     ('A %-sal.', ['A', '%-sal', '.']),
+    ('A $-sal.', ['A', '$-sal', '.']),
     ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.'])
 ]
 
@@ -206,7 +207,7 @@ NUMBER_TESTS = [
     ('A 5$-ban', ['A', '5$-ban']),
     ('A 5$.', ['A', '5', '$', '.']),
     ('A 5$', ['A', '5', '$']),
-    ('A $5', ['A', '$', '5']),
+    ('A $5', ['A', '$5']),
     ('A 5km/h', ['A', '5', 'km/h']),
     ('A 75%+1-100%-ig', ['A', '75%+1-100%-ig']),
     ('A 5km/h.', ['A', '5', 'km/h', '.']),
@@ -247,7 +248,8 @@ WIKI_TESTS = [
     ('"(...)"–sokkal ', ['"', '(', '...', ')', '"', '–sokkal']),
 ]
 
-TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS # + HYPHEN_TESTS # + WIKI_TESTS
+TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS # + WIKI_TESTS
+
 
 @pytest.mark.parametrize('text,expected_tokens', TESTCASES)
 def test_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):
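For reference, below is a minimal, hypothetical sketch of how the reworked hyphen and currency handling could be spot-checked by hand. It assumes a spaCy checkout from around this commit, where a bare Hungarian tokenizer can be built the way the hu_tokenizer test fixture does (roughly Hungarian.Defaults.create_tokenizer()); the construction and attribute names may differ in other versions, and this script is not part of the patch itself.

    # spot_check_hu_hyphens.py - hypothetical manual check mirroring a few cases
    # from HYPHEN_TESTS / NUMBER_TESTS above.
    from spacy.hu import Hungarian

    # Assumption: build a standalone tokenizer as the test fixture does.
    tokenizer = Hungarian.Defaults.create_tokenizer()

    cases = [
        ('A--B', ['A', '--', 'B']),         # double hyphen becomes its own token
        ('A $-sal.', ['A', '$-sal', '.']),  # currency symbol + suffix kept together
        ('A $5', ['A', '$5']),              # '$' is no longer split off as a prefix
    ]

    for text, expected in cases:
        tokens = [token.text for token in tokenizer(text)]
        print(text, tokens, 'OK' if tokens == expected else 'MISMATCH')

Running the existing suite (spacy/tests/hu/test_tokenizer.py) exercises the same expectations via the now-enabled HYPHEN_TESTS block.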