diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
index 27a2912e2..b758e0104 100644
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@@ -1,18 +1,17 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 
+LIST_ICONS = [r'[\p{So}--[°]]']
 
 _currency = r'\$|¢|£|€|¥|฿'
 _quotes = QUOTES.replace("'", '')
+_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
 
-_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES)
-
-_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
              [r'(?<=[0-9])\+',
               r'(?<=°[FfCcKk])\.',
               r'(?<=[0-9])(?:{})'.format(_currency),
@@ -20,8 +19,7 @@ _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
               r'(?<=[{}{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES, _currency),
               r'(?<=[{})])-e'.format(ALPHA_LOWER)])
 
-
-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
@@ -29,7 +27,6 @@ _infixes = (LIST_ELLIPSES +
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)])
 
-
 TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_SUFFIXES = _suffixes
 TOKENIZER_INFIXES = _infixes
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
index 70fb103dc..57281b998 100644
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -41,7 +41,5 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
 @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
                                          ('i💙you', 3), ('🤘🤘yay!', 4)])
 def test_tokenizer_handles_emoji(tokenizer, text, length):
-    exceptions = ["hu"]
     tokens = tokenizer(text)
-    if tokens[0].lang_ not in exceptions:
-        assert len(tokens) == length
+    assert len(tokens) == length
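
Note (not part of the patch): the new `LIST_ICONS` class depends on the third-party `regex` package, which spaCy uses to compile its tokenizer rules; the stdlib `re` module supports neither `\p{So}` nor the `--` set-subtraction syntax. A minimal sketch of what the class matches, with a hypothetical `ICONS` pattern name:

```python
# Minimal sketch (assumed names; not from the patch). The class matches
# "Symbol, other" codepoints (emoji, dingbats) but subtracts the degree
# sign, which is also category So, so suffix rules like r'(?<=°[FfCcKk])\.'
# still treat '4°C' as a measurement rather than an icon.
import regex  # third-party; pip install regex

ICONS = regex.compile(r'[\p{So}--[°]]', regex.V1)  # V1 enables `--` subtraction

assert ICONS.search('i💙you')        # U+1F499 BLUE HEART is \p{So}
assert ICONS.search('🤘🤘yay!')      # U+1F918 SIGN OF THE HORNS too
assert ICONS.search('4°C') is None   # ° (also \p{So}) is carved out
```

With icons added to the prefix, suffix, and infix rules, the Hungarian tokenizer splits emoji off in all three positions, which is why the `"hu"` carve-out in `test_tokenizer_handles_emoji` can be dropped.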