Fixed emoji handling for Hungarian

2026-01-03 15:33:33 +03:00 · 2017-05-30 21:34:46 +02:00 · 2017-05-30 21:34:46 +02:00 · 8c0b4b850e
commit 8c0b4b850e
parent f86289566a
2 changed files with 6 additions and 11 deletions
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@ -1,18 +1,17 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER

+LIST_ICONS = [r'[\p{So}--[°]]']

 _currency = r'\$|¢|£|€|¥|฿'
 _quotes = QUOTES.replace("'", '')

+_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)

-_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES)
-
-_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
             [r'(?<=[0-9])\+',
              r'(?<=°[FfCcKk])\.',
              r'(?<=[0-9])(?:{})'.format(_currency),
@ -20,8 +19,7 @@ _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
              r'(?<=[{}{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES, _currency),
              r'(?<=[{})])-e'.format(ALPHA_LOWER)])

-
-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
@ -29,7 +27,6 @@ _infixes = (LIST_ELLIPSES +
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)])

-
 TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_SUFFIXES = _suffixes
 TOKENIZER_INFIXES = _infixes
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@ -41,7 +41,5 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
 def test_tokenizer_handles_emoji(tokenizer, text, length):
-    exceptions = ["hu"]
    tokens = tokenizer(text)
-    if tokens[0].lang_ not in exceptions:
-        assert len(tokens) == length
+    assert len(tokens) == length