diff --git a/spacy/hu/punctuation.py b/spacy/hu/punctuation.py index af0c2b559..ca1656a18 100644 --- a/spacy/hu/punctuation.py +++ b/spacy/hu/punctuation.py @@ -35,8 +35,7 @@ TOKENIZER_INFIXES = ( r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), - r'(?<=[0-9{a}])({q})(?=[\-{a}])'.format(a=ALPHA, q=QUOTES), + r'(?<=[0-9{a}])(({q})|[\)\]])(?=\-[{a}])'.format(a=ALPHA, q=QUOTES), ] ) - __all__ = ["TOKENIZER_PREFIXES", "TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"] diff --git a/spacy/tests/hu/test_tokenizer.py b/spacy/tests/hu/test_tokenizer.py index 4536d6658..e4d40c195 100644 --- a/spacy/tests/hu/test_tokenizer.py +++ b/spacy/tests/hu/test_tokenizer.py @@ -248,7 +248,7 @@ WIKI_TESTS = [ ('"(...)"–sokkal ', ['"', '(', '...', ')', '"', '–sokkal']), ] -TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS # + WIKI_TESTS +TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS @pytest.mark.parametrize('text,expected_tokens', TESTCASES)