Merge branch 'develop' of https://github.com/explosion/spaCy into develop

commit fe28602f2e
@@ -1,18 +1,17 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER

+LIST_ICONS = [r'[\p{So}--[°]]']

 _currency = r'\$|¢|£|€|¥|฿'
 _quotes = QUOTES.replace("'", '')

-_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES)
+_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)

-
-_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
              [r'(?<=[0-9])\+',
               r'(?<=°[FfCcKk])\.',
               r'(?<=[0-9])(?:{})'.format(_currency),
@@ -20,8 +19,7 @@ _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
               r'(?<=[{}{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES, _currency),
               r'(?<=[{})])-e'.format(ALPHA_LOWER)])

-
-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
@@ -29,7 +27,6 @@ _infixes = (LIST_ELLIPSES +
-             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)])


 TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_SUFFIXES = _suffixes
 TOKENIZER_INFIXES = _infixes
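The substantive change above is LIST_ICONS: pictographic symbols are added to the prefix, suffix and infix rules so emoji split into their own tokens. A minimal sketch of how that character class behaves, assuming the third-party regex package (which spaCy used to compile these rules; it supports \p{So} and the -- set-difference operator under its V1 behaviour). The snippet is illustrative, not part of the commit:

# Illustrative only, not part of the commit.
# \p{So} matches Unicode "Symbol, other" (emoji, dingbats, etc.);
# --[°] subtracts the degree sign so measurements like 99°C keep
# their existing suffix handling.
import regex

ICON_RE = regex.compile(r'[\p{So}--[°]]', regex.V1)  # V1 enables set operations

assert ICON_RE.match('🍕') is not None   # pizza emoji is Symbol-other
assert ICON_RE.match('°') is None        # degree sign is carved out
assert ICON_RE.match('a') is None        # letters are unaffected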
@@ -41,7 +41,5 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
 @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
                                          ('i💙you', 3), ('🤘🤘yay!', 4)])
 def test_tokenizer_handles_emoji(tokenizer, text, length):
-    exceptions = ["hu"]
     tokens = tokenizer(text)
-    if tokens[0].lang_ not in exceptions:
-        assert len(tokens) == length
+    assert len(tokens) == length
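With the icon rules now in place for Hungarian, the "hu" escape hatch is dropped and the test asserts unconditionally (the lang_ check suggests the tokenizer fixture runs across languages). An illustrative way to reproduce what the test asserts, using the English defaults of the v2-era API as one concrete choice; this is an assumption about the fixture, not code from the commit:

# Illustrative only, not part of the commit.
from spacy.lang.en import English

nlp = English()
doc = nlp.tokenizer('can you still dunk?🍕🍔😵LOL')
print([t.text for t in doc])   # the test expects 8 tokens for this text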
@@ -408,7 +408,7 @@ p
 | To label the hashtags, we first need to add a new custom flag.
 | #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
 | to the hashtag's span, and check its value via a token's
-| #[+api("token#check_flag") #[code code check_flag()]] method. On each
+| #[+api("token#check_flag") #[code check_flag()]] method. On each
 | match, we merge the hashtag and assign the flag.

 +code.
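The paragraph fixed above (a doubled "code" in the Jade markup) describes spaCy's custom-flag workflow: register a flag ID, assign it, and read it back with check_flag(). A minimal sketch under the v2-era API; vocab.add_flag() and token.check_flag() are the documented calls, while the getter and sample text are invented here for illustration. The docs' full example additionally merges each matched hashtag into a single token before assigning the flag:

# Illustrative only, not part of the commit.
from spacy.lang.en import English

nlp = English()
# add_flag() registers a boolean flag and returns its ID;
# this (made-up) getter marks lexemes whose text starts with '#'.
IS_HASHTAG = nlp.vocab.add_flag(lambda text: text.startswith('#'))

doc = nlp.tokenizer('Hello #MondayMotivation')
for token in doc:
    print(token.text, token.check_flag(IS_HASHTAG))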