mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Add a symbols character class (\p{So}) to the punctuation rules to handle emoji (see #1088)
This currently doesn't work for Hungarian because of conflicts with its
custom punctuation rules. It also doesn't take multi-character emoji
(single emoji built from several code points, like 👩🏽💻) into account.
This commit is contained in:
parent
dc07d72d80
commit
a8e58e04ef
|
@ -1,8 +1,8 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||||
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
|
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS
|
||||||
|
|
||||||
|
|
||||||
_currency = r"\$|¢|£|€|¥|฿|৳"
|
_currency = r"\$|¢|£|€|¥|฿|৳"
|
||||||
|
@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '')
|
||||||
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
|
_list_punct = LIST_PUNCT + '। ॥'.strip().split()
|
||||||
|
|
||||||
|
|
||||||
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES)
|
_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
|
||||||
|
|
||||||
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES +
|
_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||||
[r'(?<=[0-9])\+',
|
[r'(?<=[0-9])\+',
|
||||||
r'(?<=°[FfCcKk])\.',
|
r'(?<=°[FfCcKk])\.',
|
||||||
r'(?<=[0-9])(?:{})'.format(_currency),
|
r'(?<=[0-9])(?:{})'.format(_currency),
|
||||||
r'(?<=[0-9])(?:{})'.format(UNITS),
|
r'(?<=[0-9])(?:{})'.format(UNITS),
|
||||||
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
|
r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%²\-\)\]\+', QUOTES]), _currency)])
|
||||||
|
|
||||||
_infixes = (LIST_ELLIPSES +
|
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||||
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||||
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
|
||||||
|
|
|
@ -20,7 +20,6 @@ _upper = [_latin_upper]
|
||||||
_lower = [_latin_lower]
|
_lower = [_latin_lower]
|
||||||
_uncased = [_bengali, _hebrew]
|
_uncased = [_bengali, _hebrew]
|
||||||
|
|
||||||
|
|
||||||
ALPHA = merge_char_classes(_upper + _lower + _uncased)
|
ALPHA = merge_char_classes(_upper + _lower + _uncased)
|
||||||
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
|
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
|
||||||
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
|
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
|
||||||
|
@ -33,13 +32,14 @@ _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
|
||||||
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
|
_punct = r'… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
|
||||||
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
|
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‚ , „ » «'
|
||||||
_hyphens = '- – — -- ---'
|
_hyphens = '- – — -- ---'
|
||||||
|
_other_symbols = r'[\p{So}]'
|
||||||
|
|
||||||
UNITS = merge_chars(_units)
|
UNITS = merge_chars(_units)
|
||||||
CURRENCY = merge_chars(_currency)
|
CURRENCY = merge_chars(_currency)
|
||||||
QUOTES = merge_chars(_quotes)
|
QUOTES = merge_chars(_quotes)
|
||||||
PUNCT = merge_chars(_punct)
|
PUNCT = merge_chars(_punct)
|
||||||
HYPHENS = merge_chars(_hyphens)
|
HYPHENS = merge_chars(_hyphens)
|
||||||
|
ICONS = _other_symbols
|
||||||
|
|
||||||
LIST_UNITS = split_chars(_units)
|
LIST_UNITS = split_chars(_units)
|
||||||
LIST_CURRENCY = split_chars(_currency)
|
LIST_CURRENCY = split_chars(_currency)
|
||||||
|
@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes)
|
||||||
LIST_PUNCT = split_chars(_punct)
|
LIST_PUNCT = split_chars(_punct)
|
||||||
LIST_HYPHENS = split_chars(_hyphens)
|
LIST_HYPHENS = split_chars(_hyphens)
|
||||||
LIST_ELLIPSES = [r'\.\.+', '…']
|
LIST_ELLIPSES = [r'\.\.+', '…']
|
||||||
|
LIST_ICONS = [_other_symbols]
|
||||||
|
|
|
@ -2,15 +2,16 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||||
from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
|
from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
|
||||||
from .char_classes import CURRENCY, UNITS
|
from .char_classes import QUOTES, CURRENCY, UNITS
|
||||||
|
|
||||||
|
|
||||||
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
||||||
LIST_CURRENCY)
|
LIST_CURRENCY + LIST_ICONS)
|
||||||
|
|
||||||
|
|
||||||
_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||||
|
["'s", "'S", "’s", "’S"] +
|
||||||
[r'(?<=[0-9])\+',
|
[r'(?<=[0-9])\+',
|
||||||
r'(?<=°[FfCcKk])\.',
|
r'(?<=°[FfCcKk])\.',
|
||||||
r'(?<=[0-9])(?:{})'.format(CURRENCY),
|
r'(?<=[0-9])(?:{})'.format(CURRENCY),
|
||||||
|
@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU
|
||||||
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
|
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
|
||||||
|
|
||||||
|
|
||||||
_infixes = (LIST_ELLIPSES +
|
_infixes = (LIST_ELLIPSES + LIST_ICONS +
|
||||||
[r'(?<=[0-9])[+\-\*^](?=[0-9-])',
|
[r'(?<=[0-9])[+\-\*^](?=[0-9-])',
|
||||||
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||||
|
|
|
@ -1,7 +1,4 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
"""Test that tokenizer exceptions and emoticons are handled correctly."""
|
|
||||||
|
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer):
|
||||||
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
|
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
|
||||||
tokens = tokenizer(text)
|
tokens = tokenizer(text)
|
||||||
assert len(tokens) == length
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
|
||||||
|
('i💙you', 3), ('🤘🤘yay!', 4)])
|
||||||
|
def test_tokenizer_handles_emoji(tokenizer, text, length):
|
||||||
|
exceptions = ["hu"]
|
||||||
|
tokens = tokenizer(text)
|
||||||
|
if tokens[0].lang_ not in exceptions:
|
||||||
|
assert len(tokens) == length
|
||||||
|
|
Loading…
Reference in New Issue
Block a user