💫 Make like_num work for prefixed numbers (#2808)

* Only split off the "+" prefix when it is not followed by a digit
* Make like_num work for prefixed numbers
* Add test for like_num
2025-12-16 06:34:20 +03:00 · 2018-10-01 10:49:14 +02:00 · 2018-10-01 10:49:14 +02:00 · ea20b72c08
commit ea20b72c08
parent b39810d692
19 changed files with 61 additions and 12 deletions
--- a/spacy/lang/ar/lex_attrs.py
+++ b/spacy/lang/ar/lex_attrs.py
@ -76,6 +76,8 @@ def like_num(text):
    """
    check if text resembles a number
    """
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/da/lex_attrs.py
+++ b/spacy/lang/da/lex_attrs.py
@ -34,6 +34,8 @@ enoghalvfemsindstyvende tooghalvfemsindstyvende treoghalvfemsindstyvende fireogh
 """.split()
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/el/lex_attrs.py
+++ b/spacy/lang/el/lex_attrs.py
@ -21,6 +21,8 @@ _num_words = ['μηδέν', 'ένας', 'δυο', 'δυό', 'τρεις', 'τέ
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/en/lex_attrs.py
+++ b/spacy/lang/en/lex_attrs.py
@ -13,6 +13,8 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/fi/lex_attrs.py
+++ b/spacy/lang/fi/lex_attrs.py
@ -10,6 +10,8 @@ _num_words = ['nolla', 'yksi', 'kaksi', 'kolme', 'neljä', 'viisi', 'kuusi', 'se
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace('.', '').replace(',', '')
    if text.isdigit():
        return True
--- a/spacy/lang/fr/lex_attrs.py
+++ b/spacy/lang/fr/lex_attrs.py
@ -24,6 +24,8 @@ sextillionnième septillionnième octillionnième nonillionnième decillionnièm
 def like_num(text):
    # Might require more work?
    # See this discussion: https://github.com/explosion/spaCy/pull/1161
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/hi/lex_attrs.py
+++ b/spacy/lang/hi/lex_attrs.py
@ -14,11 +14,11 @@ _stem_suffixes = [
    ["ाएंगी","ाएंगे","ाऊंगी","ाऊंगा","ाइयाँ","ाइयों","ाइयां"]
 ]
-#reference 1:https://en.wikipedia.org/wiki/Indian_numbering_system 
+#reference 1:https://en.wikipedia.org/wiki/Indian_numbering_system
 #reference 2: https://blogs.transparent.com/hindi/hindi-numbers-1-100/
-_num_words = ['शून्य', 'एक',  'दो', 'तीन', 'चार', 'पांच', 'छह', 'सात', 'आठ', 'नौ', 'दस', 
+_num_words = ['शून्य', 'एक',  'दो', 'तीन', 'चार', 'पांच', 'छह', 'सात', 'आठ', 'नौ', 'दस',
-              'ग्यारह', 'बारह', 'तेरह', 'चौदह', 'पंद्रह', 'सोलह', 'सत्रह', 'अठारह', 'उन्नीस', 
+              'ग्यारह', 'बारह', 'तेरह', 'चौदह', 'पंद्रह', 'सोलह', 'सत्रह', 'अठारह', 'उन्नीस',
              'बीस', 'तीस', 'चालीस', 'पचास', 'साठ', 'सत्तर', 'अस्सी', 'नब्बे', 'सौ', 'हज़ार',
              'लाख', 'करोड़', 'अरब', 'खरब']
@ -40,6 +40,8 @@ def norm(string):
    return string
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/id/lex_attrs.py
+++ b/spacy/lang/id/lex_attrs.py
@ -11,6 +11,8 @@ _num_words = ['nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@ -38,6 +38,8 @@ def is_ascii(text):
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    # can be overwritten by lang with list of number words
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
--- a/spacy/lang/nl/lex_attrs.py
+++ b/spacy/lang/nl/lex_attrs.py
@ -23,6 +23,8 @@ def like_num(text):
    # or matches one of the number words. In order to handle numbers like
    # "drieëntwintig", more work is required.
    # See this discussion: https://github.com/explosion/spaCy/pull/1177
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/pt/lex_attrs.py
+++ b/spacy/lang/pt/lex_attrs.py
@ -21,6 +21,8 @@ _ordinal_words = ['primeiro', 'segundo', 'terceiro', 'quarto', 'quinto', 'sexto'
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@ -6,8 +6,8 @@ from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
 from .char_classes import QUOTES, CURRENCY, UNITS
-_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_prefixes = (['§', '%', '=', r'\+(?![0-9])'] + LIST_PUNCT + LIST_ELLIPSES +
-             LIST_CURRENCY + LIST_ICONS)
+             LIST_QUOTES + LIST_CURRENCY + LIST_ICONS)
 _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
--- a/spacy/lang/ro/lex_attrs.py
+++ b/spacy/lang/ro/lex_attrs.py
@ -23,6 +23,8 @@ miilea mielea mia milionulea milioana miliardulea miliardelea miliarda enșpemia
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/ru/lex_attrs.py
+++ b/spacy/lang/ru/lex_attrs.py
@ -18,6 +18,8 @@ _num_words = [
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/tr/lex_attrs.py
+++ b/spacy/lang/tr/lex_attrs.py
@ -13,6 +13,8 @@ _num_words = ['bir', 'iki', 'üç', 'dört', 'beş', 'altı', 'yedi', 'sekiz',
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
@ -28,4 +30,3 @@ def like_num(text):
 LEX_ATTRS = {
    LIKE_NUM: like_num
 }
--- a/spacy/lang/tt/lex_attrs.py
+++ b/spacy/lang/tt/lex_attrs.py
@ -12,6 +12,8 @@ _num_words = ['нуль', 'ноль', 'бер', 'ике', 'өч', 'дүрт', '
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/ur/lex_attrs.py
+++ b/spacy/lang/ur/lex_attrs.py
@ -9,26 +9,28 @@ from ...attrs import LIKE_NUM
 # https://www.urdu-english.com/lessons/beginner/numbers
 _num_words = """ایک دو تین چار پانچ چھ سات آٹھ نو دس گیارہ بارہ تیرہ چودہ پندرہ سولہ سترہ
- اٹهارا انیس بیس اکیس بائیس تئیس چوبیس پچیس چھببیس 
+ اٹهارا انیس بیس اکیس بائیس تئیس چوبیس پچیس چھببیس
 ستایس اٹھائس انتيس تیس اکتیس بتیس تینتیس چونتیس پینتیس
- چھتیس سینتیس ارتیس انتالیس چالیس اکتالیس بیالیس تیتالیس 
+ چھتیس سینتیس ارتیس انتالیس چالیس اکتالیس بیالیس تیتالیس
 چوالیس پیتالیس چھیالیس سینتالیس اڑتالیس انچالیس پچاس اکاون باون
- تریپن چون پچپن چھپن ستاون اٹھاون انسٹھ ساثھ 
+ تریپن چون پچپن چھپن ستاون اٹھاون انسٹھ ساثھ
-اکسٹھ باسٹھ تریسٹھ چوسٹھ پیسٹھ چھیاسٹھ سڑسٹھ اڑسٹھ 
+اکسٹھ باسٹھ تریسٹھ چوسٹھ پیسٹھ چھیاسٹھ سڑسٹھ اڑسٹھ
 انھتر ستر اکھتر بھتتر تیھتر چوھتر تچھتر چھیتر ستتر
 اٹھتر انیاسی اسی اکیاسی بیاسی تیراسی چوراسی پچیاسی چھیاسی
- سٹیاسی اٹھیاسی نواسی نوے اکانوے بانوے ترانوے 
+ سٹیاسی اٹھیاسی نواسی نوے اکانوے بانوے ترانوے
 چورانوے پچانوے چھیانوے ستانوے اٹھانوے ننانوے سو
 """.split()
 # source https://www.google.com/intl/ur/inputtools/try/
 _ordinal_words = """پہلا دوسرا تیسرا چوتھا پانچواں چھٹا ساتواں آٹھواں نواں دسواں گیارہواں بارہواں تیرھواں چودھواں
- پندرھواں سولہواں سترھواں اٹھارواں انیسواں بسیواں 
+ پندرھواں سولہواں سترھواں اٹھارواں انیسواں بسیواں
 """.split()
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/vi/lex_attrs.py
+++ b/spacy/lang/vi/lex_attrs.py
@ -9,6 +9,8 @@ _num_words = ['không', 'một', 'hai', 'ba', 'bốn', 'năm', 'sáu', 'bẩy',
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/tests/regression/test_issue2782.py
+++ b/spacy/tests/regression/test_issue2782.py
@ -0,0 +1,16 @@
 # coding: utf8
 from __future__ import unicode_literals
 from spacy.util import get_lang_class
 import pytest
@pytest.mark.parametrize('text', ['-0.23', '+123,456', '±1'])
@pytest.mark.parametrize('lang', ['en', 'xx'])
 def test_issue2782(text, lang):
    """Check that a sign-prefixed number stays one token and is like_num.

    Runs once for every combination of the parametrized texts (numbers
    prefixed with '-', '+' and '±') and the parametrized language codes.
    """
    cls = get_lang_class(lang)
    nlp = cls()
    doc = nlp(text)
    # The leading sign must not be split off into a separate token.
    assert len(doc) == 1
    assert doc[0].like_num