💫 Make like_num work for prefixed numbers (#2808)

* Only split off the + prefix if it is not followed by a number
* Make like_num work for prefixed numbers
* Add test for like_num
2025-12-16 14:44:19 +03:00 · 2018-10-01 10:49:14 +02:00 · 2018-10-01 10:49:14 +02:00 · ea20b72c08
commit ea20b72c08
parent b39810d692
19 changed files with 61 additions and 12 deletions
--- a/spacy/lang/ar/lex_attrs.py
+++ b/spacy/lang/ar/lex_attrs.py
@ -76,6 +76,8 @@ def like_num(text):
    """
    check if text resembles a number
    """
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/da/lex_attrs.py
+++ b/spacy/lang/da/lex_attrs.py
@ -34,6 +34,8 @@ enoghalvfemsindstyvende tooghalvfemsindstyvende treoghalvfemsindstyvende fireogh
 """.split()
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/el/lex_attrs.py
+++ b/spacy/lang/el/lex_attrs.py
@ -21,6 +21,8 @@ _num_words = ['μηδέν', 'ένας', 'δυο', 'δυό', 'τρεις', 'τέ
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/en/lex_attrs.py
+++ b/spacy/lang/en/lex_attrs.py
@ -13,6 +13,8 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/fi/lex_attrs.py
+++ b/spacy/lang/fi/lex_attrs.py
@ -10,6 +10,8 @@ _num_words = ['nolla', 'yksi', 'kaksi', 'kolme', 'neljä', 'viisi', 'kuusi', 'se
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace('.', '').replace(',', '')
    if text.isdigit():
        return True
--- a/spacy/lang/fr/lex_attrs.py
+++ b/spacy/lang/fr/lex_attrs.py
@ -24,6 +24,8 @@ sextillionnième septillionnième octillionnième nonillionnième decillionnièm
 def like_num(text):
    # Might require more work?
    # See this discussion: https://github.com/explosion/spaCy/pull/1161
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/hi/lex_attrs.py
+++ b/spacy/lang/hi/lex_attrs.py
@ -40,6 +40,8 @@ def norm(string):
    return string
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/id/lex_attrs.py
+++ b/spacy/lang/id/lex_attrs.py
@ -11,6 +11,8 @@ _num_words = ['nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@ -38,6 +38,8 @@ def is_ascii(text):
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    # can be overwritten by lang with list of number words
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
--- a/spacy/lang/nl/lex_attrs.py
+++ b/spacy/lang/nl/lex_attrs.py
@ -23,6 +23,8 @@ def like_num(text):
    # or matches one of the number words. In order to handle numbers like
    # "drieëntwintig", more work is required.
    # See this discussion: https://github.com/explosion/spaCy/pull/1177
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/pt/lex_attrs.py
+++ b/spacy/lang/pt/lex_attrs.py
@ -21,6 +21,8 @@ _ordinal_words = ['primeiro', 'segundo', 'terceiro', 'quarto', 'quinto', 'sexto'
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@ -6,8 +6,8 @@ from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
 from .char_classes import QUOTES, CURRENCY, UNITS
-_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_prefixes = (['§', '%', '=', r'\+(?![0-9])'] + LIST_PUNCT + LIST_ELLIPSES +
-             LIST_CURRENCY + LIST_ICONS)
+             LIST_QUOTES + LIST_CURRENCY + LIST_ICONS)
 _suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
--- a/spacy/lang/ro/lex_attrs.py
+++ b/spacy/lang/ro/lex_attrs.py
@ -23,6 +23,8 @@ miilea mielea mia milionulea milioana miliardulea miliardelea miliarda enșpemia
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/ru/lex_attrs.py
+++ b/spacy/lang/ru/lex_attrs.py
@ -18,6 +18,8 @@ _num_words = [
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/tr/lex_attrs.py
+++ b/spacy/lang/tr/lex_attrs.py
@ -13,6 +13,8 @@ _num_words = ['bir', 'iki', 'üç', 'dört', 'beş', 'altı', 'yedi', 'sekiz',
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
@ -28,4 +30,3 @@ def like_num(text):
 # Lexical attribute functions exported by this module: registers the
 # like_num function defined above under the LIKE_NUM key.
 LEX_ATTRS = {
    LIKE_NUM: like_num
 }
--- a/spacy/lang/tt/lex_attrs.py
+++ b/spacy/lang/tt/lex_attrs.py
@ -12,6 +12,8 @@ _num_words = ['нуль', 'ноль', 'бер', 'ике', 'өч', 'дүрт', '
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/ur/lex_attrs.py
+++ b/spacy/lang/ur/lex_attrs.py
@ -29,6 +29,8 @@ _ordinal_words = """پہلا دوسرا تیسرا چوتھا پانچواں چ
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/lang/vi/lex_attrs.py
+++ b/spacy/lang/vi/lex_attrs.py
@ -9,6 +9,8 @@ _num_words = ['không', 'một', 'hai', 'ba', 'bốn', 'năm', 'sáu', 'bẩy',
 def like_num(text):
    if text.startswith(('+', '-', '±', '~')):
        text = text[1:]
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
--- a/spacy/tests/regression/test_issue2782.py
+++ b/spacy/tests/regression/test_issue2782.py
@ -0,0 +1,16 @@
 # coding: utf8
 from __future__ import unicode_literals
 from spacy.util import get_lang_class
 import pytest
@pytest.mark.parametrize('text', ['-0.23', '+123,456', '±1'])
@pytest.mark.parametrize('lang', ['en', 'xx'])
 def test_issue2782(text, lang):
    """Check that like_num handles a leading +, - or ± before a number.

    Runs once for every combination of the parametrized `text` and `lang`
    values. Each text must come back as a single token, and that token
    must report like_num as true.
    """
    # Look up the language class for the given code and instantiate it
    # with default arguments.
    cls = get_lang_class(lang)
    nlp = cls()
    doc = nlp(text)
    # The prefix character must not be split off into a separate token.
    assert len(doc) == 1
    assert doc[0].like_num