From a8e58e04efc5b57a2425595eaf1e049c23a37352 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Sat, 27 May 2017 17:57:10 +0200
Subject: [PATCH] Add symbols class to punctuation rules to handle emoji (see
 #1088)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently doesn't work for Hungarian because of conflicts with its
custom punctuation rules. Also doesn't yet take multi-character emoji
(sequences of several code points, like 👩🏽‍💻) into account.
---
 spacy/lang/bn/punctuation.py             | 10 +++++-----
 spacy/lang/char_classes.py               |  5 +++--
 spacy/lang/punctuation.py                | 11 ++++++-----
 spacy/tests/tokenizer/test_exceptions.py | 12 +++++++++---
 4 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py
index 66b7d967c..96485dd55 100644
--- a/spacy/lang/bn/punctuation.py
+++ b/spacy/lang/bn/punctuation.py
@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
-from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
+from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
 
 
 _currency = r"\$|¢|£|€|¥|฿|৳"
@@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
 _list_punct = LIST_PUNCT + '। ॥'.strip().split()
 
 
-_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
+_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
 
-_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
              [r'(?<=[0-9])\+',
               r'(?<=°[FfCcKk])\.',
               r'(?<=[0-9])(?:{})'.format(_currency),
               r'(?<=[0-9])(?:{})'.format(UNITS),
               r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
 
-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 5b81eddde..bec685646 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -20,7 +20,6 @@ _upper = [_latin_upper]
 _lower = [_latin_lower]
 _uncased = [_bengali, _hebrew]
 
-
 ALPHA = merge_char_classes(_upper + _lower + _uncased)
 ALPHA_LOWER = merge_char_classes(_lower + _uncased)
 ALPHA_UPPER = merge_char_classes(_upper + _uncased)
@@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
 _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
 _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
 _hyphens = '- – — -- ---'
-
+_other_symbols = r'[\p{So}]'  # Unicode category So ("Symbol, other"); NOTE(review): \p{} needs the `regex` package, not stdlib re - confirm
 
 UNITS = merge_chars(_units)
 CURRENCY = merge_chars(_currency)
 QUOTES = merge_chars(_quotes)
 PUNCT = merge_chars(_punct)
 HYPHENS = merge_chars(_hyphens)
+ICONS = _other_symbols
 
 LIST_UNITS = split_chars(_units)
 LIST_CURRENCY = split_chars(_currency)
@@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
 LIST_PUNCT = split_chars(_punct)
 LIST_HYPHENS = split_chars(_hyphens)
 LIST_ELLIPSES = [r'\.\.+', '…']
+LIST_ICONS = [_other_symbols]
diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py
index 74bb28f5f..680f5cff0 100644
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -2,15 +2,16 @@
 from __future__ import unicode_literals
 
 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
-from .char_classes import CURRENCY, UNITS
+from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
+from .char_classes import QUOTES, CURRENCY, UNITS
 
 
 _prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
-             LIST_CURRENCY)
+             LIST_CURRENCY + LIST_ICONS)
 
 
-_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
+             ["'s", "'S", "’s", "’S"] +
              [r'(?<=[0-9])\+',
               r'(?<=°[FfCcKk])\.',
               r'(?<=[0-9])(?:{})'.format(CURRENCY),
@@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
               r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
 
 
-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[0-9])[+\-\*^](?=[0-9-])',
              r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
index aab27714e..70fb103dc 100644
--- a/spacy/tests/tokenizer/test_exceptions.py
+++ b/spacy/tests/tokenizer/test_exceptions.py
@@ -1,7 +1,4 @@
 # coding: utf-8
-"""Test that tokenizer exceptions and emoticons are handled correctly."""
-
-
 from __future__ import unicode_literals
 
 import pytest
@@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
 def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
     tokens = tokenizer(text)
     assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
+                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
+def test_tokenizer_handles_emoji(tokenizer, text, length):  # expects emoji to be split off as separate tokens
+    exceptions = ["hu"]  # Hungarian: its custom punctuation rules conflict with the emoji rule
+    tokens = tokenizer(text)
+    if tokens[0].lang_ not in exceptions:  # NOTE(review): excepted languages pass without asserting anything
+        assert len(tokens) == length