From a8e58e04efc5b57a2425595eaf1e049c23a37352 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:57:10 +0200 Subject: [PATCH] Add symbols class to punctuation rules to handle emoji (see #1088) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently doesn't work for Hungarian, because of conflicts with the custom punctuation rules. Also doesn't take multi-character emoji like 👩🏽‍💻 into account. --- spacy/lang/bn/punctuation.py | 10 +++++----- spacy/lang/char_classes.py | 5 +++-- spacy/lang/punctuation.py | 11 ++++++----- spacy/tests/tokenizer/test_exceptions.py | 12 +++++++++--- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py index 66b7d967c..96485dd55 100644 --- a/spacy/lang/bn/punctuation.py +++ b/spacy/lang/bn/punctuation.py @@ -1,8 +1,8 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS -from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS +from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS _currency = r"\$|¢|£|€|¥|฿|৳" @@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '') _list_punct = LIST_PUNCT + '। ॥'.strip().split() -_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES) +_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS) -_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + [r'(?<=[0-9])\+', r'(?<=°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(_currency), r'(?<=[0-9])(?:{})'.format(UNITS), r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)]) -_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 5b81eddde..bec685646 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -20,7 +20,6 @@ _upper = [_latin_upper] _lower = [_latin_lower] _uncased = [_bengali, _hebrew] - ALPHA = merge_char_classes(_upper + _lower + _uncased) ALPHA_LOWER = merge_char_classes(_lower + _uncased) ALPHA_UPPER = merge_char_classes(_upper + _uncased) @@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$' _punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &' _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «' _hyphens = '- – — -- ---' - +_other_symbols = r'[\p{So}]' UNITS = merge_chars(_units) CURRENCY = merge_chars(_currency) QUOTES = merge_chars(_quotes) PUNCT = merge_chars(_punct) HYPHENS = merge_chars(_hyphens) +ICONS = _other_symbols LIST_UNITS = split_chars(_units) LIST_CURRENCY = split_chars(_currency) @@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes) LIST_PUNCT = split_chars(_punct) LIST_HYPHENS = split_chars(_hyphens) LIST_ELLIPSES = [r'\.\.+', '…'] +LIST_ICONS = [_other_symbols] diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index 74bb28f5f..680f5cff0 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -2,15 +2,16 @@ from __future__ import unicode_literals from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY -from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES -from .char_classes import CURRENCY, UNITS +from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS +from .char_classes import QUOTES, CURRENCY, UNITS _prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + - LIST_CURRENCY) + LIST_CURRENCY + LIST_ICONS) -_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + + ["'s", "'S", "’s", "’S"] + [r'(?<=[0-9])\+', r'(?<=°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(CURRENCY), @@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)]) -_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[0-9])[+\-\*^](?=[0-9-])', r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index aab27714e..70fb103dc 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -1,7 +1,4 @@ # coding: utf-8 -"""Test that tokenizer exceptions and emoticons are handled correctly.""" - - from __future__ import unicode_literals import pytest @@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer): def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): tokens = tokenizer(text) assert len(tokens) == length + + +@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8), + ('i💙you', 3), ('🤘🤘yay!', 4)]) +def test_tokenizer_handles_emoji(tokenizer, text, length): + exceptions = ["hu"] + tokens = tokenizer(text) + if tokens[0].lang_ not in exceptions: + assert len(tokens) == length