From b4df202bfac5344a7d8857e68f0b6f2fea5e5348 Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Sat, 14 Jan 2017 22:24:58 +0100 Subject: [PATCH] Better error handling --- spacy/hu/punctuation.py | 4 ++-- spacy/hu/tokenizer_exceptions.py | 6 +----- spacy/tests/hu/test_tokenizer.py | 3 ++- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/spacy/hu/punctuation.py b/spacy/hu/punctuation.py index 777a6af82..945cc170e 100644 --- a/spacy/hu/punctuation.py +++ b/spacy/hu/punctuation.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from ..language_data.punctuation import ALPHA_LOWER, LIST_ELLIPSES, QUOTES, ALPHA_UPPER, LIST_QUOTES, UNITS, \ - CURRENCY, LIST_PUNCT, ALPHA + CURRENCY, LIST_PUNCT, ALPHA, _QUOTES CURRENCY_SYMBOLS = r"\$ ¢ £ € ¥ ฿" @@ -35,7 +35,7 @@ TOKENIZER_INFIXES = ( r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), - r'(?<=[{a}])(({q})|[\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=QUOTES), + r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_QUOTES.replace("'", "").strip().replace(" ", "")), ] ) __all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/hu/tokenizer_exceptions.py b/spacy/hu/tokenizer_exceptions.py index 4d9b6d855..f1fb99469 100644 --- a/spacy/hu/tokenizer_exceptions.py +++ b/spacy/hu/tokenizer_exceptions.py @@ -63,7 +63,6 @@ EU. F. Fla. Folyt. -Ford. Fpk. Főszerk. G. @@ -184,7 +183,6 @@ Ty. Tyr. U. Ui. -User. Ut. V. VB. @@ -261,7 +259,6 @@ ea. ed. eff. egyh. -el. ell. elv. elvt. @@ -349,7 +346,7 @@ io. ip. ir. irod. -is. +irod. isk. ism. izr. @@ -604,7 +601,6 @@ zs. ált. ápr. ásv. -át. é. ék. ény. diff --git a/spacy/tests/hu/test_tokenizer.py b/spacy/tests/hu/test_tokenizer.py index e77579ede..5dd802183 100644 --- a/spacy/tests/hu/test_tokenizer.py +++ b/spacy/tests/hu/test_tokenizer.py @@ -36,6 +36,7 @@ HYPHEN_TESTS = [ ('Lakik-e... van.', ['Lakik', '-e', '...', 'van', '.']), ('Lakik-e van?', ['Lakik', '-e', 'van', '?']), ('Lakik-elem van?', ['Lakik-elem', 'van', '?']), + ('Az életbiztosításáról- egy.', ['Az', 'életbiztosításáról-', 'egy', '.']), ('Van lakik-elem.', ['Van', 'lakik-elem', '.']), ('A 7-es busz?', ['A', '7-es', 'busz', '?']), ('A 7-es?', ['A', '7-es', '?']), @@ -218,7 +219,7 @@ QUOTE_TESTS = [ ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']), ('Egy 24"-os monitor.', ['Egy', '24"-os', 'monitor', '.']), - # ("A don't van.", ['A', "don't", 'van', '.']) + ("A McDonald's van.", ['A', "McDonald's", 'van', '.']) ] DOT_TESTS = [