Add symbols class to punctuation rules to handle emoji (see #1088)

Currently doesn't work for Hungarian, because of conflicts with the custom punctuation rules. Also doesn't take multi-character emoji like 👩🏽‍💻 into account.
2025-12-27 12:03:10 +03:00 · 2017-05-27 17:57:10 +02:00 · 2017-05-27 17:57:10 +02:00 · a8e58e04ef
commit a8e58e04ef
parent dc07d72d80
4 changed files with 23 additions and 15 deletions
--- a/spacy/lang/bn/punctuation.py
+++ b/spacy/lang/bn/punctuation.py
@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS


 _currency = r"\$|¢|£|€|¥|฿|৳"
@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
 _list_punct = LIST_PUNCT + '। ॥'.strip().split()


-_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
+_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)

-_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
             [r'(?<=[0-9])\+',
              r'(?<=°[FfCcKk])\.',
              r'(?<=[0-9])(?:{})'.format(_currency),
              r'(?<=[0-9])(?:{})'.format(UNITS),
              r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])

-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@ -20,7 +20,6 @@ _upper = [_latin_upper]
 _lower = [_latin_lower]
 _uncased = [_bengali, _hebrew]

-
 ALPHA = merge_char_classes(_upper + _lower + _uncased)
 ALPHA_LOWER = merge_char_classes(_lower + _uncased)
 ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
 _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
 _hyphens = '- – — -- ---'
-
+_other_symbols = r'[\p{So}]'

 UNITS = merge_chars(_units)
 CURRENCY = merge_chars(_currency)
 QUOTES = merge_chars(_quotes)
 PUNCT = merge_chars(_punct)
 HYPHENS = merge_chars(_hyphens)
+ICONS = _other_symbols

 LIST_UNITS = split_chars(_units)
 LIST_CURRENCY = split_chars(_currency)
@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
 LIST_PUNCT = split_chars(_punct)
 LIST_HYPHENS = split_chars(_hyphens)
 LIST_ELLIPSES = [r'\.\.+', '…']
+LIST_ICONS = [_other_symbols]
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@ -2,15 +2,16 @@
 from __future__ import unicode_literals

 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
-from .char_classes import CURRENCY, UNITS
+from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
+from .char_classes import QUOTES, CURRENCY, UNITS


 _prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
-             LIST_CURRENCY)
+             LIST_CURRENCY + LIST_ICONS)


-_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
+             ["'s", "'S", "’s", "’S"] +
             [r'(?<=[0-9])\+',
              r'(?<=°[FfCcKk])\.',
              r'(?<=[0-9])(?:{})'.format(CURRENCY),
@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
              r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])


-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
            [r'(?<=[0-9])[+\-\*^](?=[0-9-])',
             r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@ -1,7 +1,4 @@
 # coding: utf-8
-"""Test that tokenizer exceptions and emoticons are handled correctly."""
-
-
 from __future__ import unicode_literals

 import pytest
@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
 def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
    tokens = tokenizer(text)
    assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
+                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
+def test_tokenizer_handles_emoji(tokenizer, text, length):
+    exceptions = ["hu"]
+    tokens = tokenizer(text)
+    if tokens[0].lang_ not in exceptions:
+        assert len(tokens) == length