From 8c0b4b850ef472d27ed36b952ed33ea25de1e152 Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Tue, 30 May 2017 21:34:46 +0200 Subject: [PATCH 1/2] Fixed emoji handling for Hungarian --- spacy/lang/hu/punctuation.py | 13 +++++-------- spacy/tests/tokenizer/test_exceptions.py | 4 +--- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index 27a2912e2..b758e0104 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -1,18 +1,17 @@ # coding: utf8 from __future__ import unicode_literals -from ..punctuation import TOKENIZER_INFIXES -from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER +LIST_ICONS = [r'[\p{So}--[°]]'] _currency = r'\$|¢|£|€|¥|฿' _quotes = QUOTES.replace("'", '') +_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS) -_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES) - -_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + [r'(?<=[0-9])\+', r'(?<=°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(_currency), @@ -20,8 +19,7 @@ _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + r'(?<=[{}{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES, _currency), r'(?<=[{})])-e'.format(ALPHA_LOWER)]) - -_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), @@ -29,7 +27,6 @@ _infixes = (LIST_ELLIPSES + r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)]) - TOKENIZER_PREFIXES = _prefixes TOKENIZER_SUFFIXES = _suffixes TOKENIZER_INFIXES = _infixes diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index 70fb103dc..57281b998 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -41,7 +41,5 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8), ('i💙you', 3), ('🤘🤘yay!', 4)]) def test_tokenizer_handles_emoji(tokenizer, text, length): - exceptions = ["hu"] tokens = tokenizer(text) - if tokens[0].lang_ not in exceptions: - assert len(tokens) == length + assert len(tokens) == length From 981196c181cb12ddde69ab6ef4878f14243c194f Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 31 May 2017 11:34:31 +0200 Subject: [PATCH 2/2] Fix typo --- website/docs/usage/rule-based-matching.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index 8588729b6..71400ea55 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -408,7 +408,7 @@ p | To label the hashtags, we first need to add a new custom flag. | #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it | to the hashtag's span, and check its value via a token's - | #[+api("token#check_flag") #[code code check_flag()]] method. On each + | #[+api("token#check_flag") #[code check_flag()]] method. On each | match, we merge the hashtag and assign the flag. +code.