mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
💫 Make like_num work for prefixed numbers (#2808)
* Only split off a leading `+` as a prefix when it is not followed by a digit
* Make like_num work for prefixed numbers
* Add test for like_num
This commit is contained in:
parent
b39810d692
commit
ea20b72c08
|
@ -76,6 +76,8 @@ def like_num(text):
|
||||||
"""
|
"""
|
||||||
check if text resembles a number
|
check if text resembles a number
|
||||||
"""
|
"""
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -34,6 +34,8 @@ enoghalvfemsindstyvende tooghalvfemsindstyvende treoghalvfemsindstyvende fireogh
|
||||||
""".split()
|
""".split()
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -21,6 +21,8 @@ _num_words = ['μηδέν', 'ένας', 'δυο', 'δυό', 'τρεις', 'τέ
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -13,6 +13,8 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -10,6 +10,8 @@ _num_words = ['nolla', 'yksi', 'kaksi', 'kolme', 'neljä', 'viisi', 'kuusi', 'se
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace('.', '').replace(',', '')
|
text = text.replace('.', '').replace(',', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -24,6 +24,8 @@ sextillionnième septillionnième octillionnième nonillionnième decillionnièm
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
# Might require more work?
|
# Might require more work?
|
||||||
# See this discussion: https://github.com/explosion/spaCy/pull/1161
|
# See this discussion: https://github.com/explosion/spaCy/pull/1161
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -40,6 +40,8 @@ def norm(string):
|
||||||
return string
|
return string
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -11,6 +11,8 @@ _num_words = ['nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -38,6 +38,8 @@ def is_ascii(text):
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
# can be overwritten by lang with list of number words
|
# can be overwritten by lang with list of number words
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
|
|
|
@ -23,6 +23,8 @@ def like_num(text):
|
||||||
# or matches one of the number words. In order to handle numbers like
|
# or matches one of the number words. In order to handle numbers like
|
||||||
# "drieëntwintig", more work is required.
|
# "drieëntwintig", more work is required.
|
||||||
# See this discussion: https://github.com/explosion/spaCy/pull/1177
|
# See this discussion: https://github.com/explosion/spaCy/pull/1177
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -21,6 +21,8 @@ _ordinal_words = ['primeiro', 'segundo', 'terceiro', 'quarto', 'quinto', 'sexto'
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -6,8 +6,8 @@ from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
|
||||||
from .char_classes import QUOTES, CURRENCY, UNITS
|
from .char_classes import QUOTES, CURRENCY, UNITS
|
||||||
|
|
||||||
|
|
||||||
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
_prefixes = (['§', '%', '=', r'\+(?![0-9])'] + LIST_PUNCT + LIST_ELLIPSES +
|
||||||
LIST_CURRENCY + LIST_ICONS)
|
LIST_QUOTES + LIST_CURRENCY + LIST_ICONS)
|
||||||
|
|
||||||
|
|
||||||
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
|
||||||
|
|
|
@ -23,6 +23,8 @@ miilea mielea mia milionulea milioana miliardulea miliardelea miliarda enșpemia
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -18,6 +18,8 @@ _num_words = [
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -13,6 +13,8 @@ _num_words = ['bir', 'iki', 'üç', 'dört', 'beş', 'altı', 'yedi', 'sekiz',
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
@ -28,4 +30,3 @@ def like_num(text):
|
||||||
LEX_ATTRS = {
|
LEX_ATTRS = {
|
||||||
LIKE_NUM: like_num
|
LIKE_NUM: like_num
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,8 @@ _num_words = ['нуль', 'ноль', 'бер', 'ике', 'өч', 'дүрт', '
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -29,6 +29,8 @@ _ordinal_words = """پہلا دوسرا تیسرا چوتھا پانچواں چ
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -9,6 +9,8 @@ _num_words = ['không', 'một', 'hai', 'ba', 'bốn', 'năm', 'sáu', 'bẩy',
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
def like_num(text):
|
||||||
|
if text.startswith(('+', '-', '±', '~')):
|
||||||
|
text = text[1:]
|
||||||
text = text.replace(',', '').replace('.', '')
|
text = text.replace(',', '').replace('.', '')
|
||||||
if text.isdigit():
|
if text.isdigit():
|
||||||
return True
|
return True
|
||||||
|
|
16
spacy/tests/regression/test_issue2782.py
Normal file
16
spacy/tests/regression/test_issue2782.py
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from spacy.util import get_lang_class
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ['-0.23', '+123,456', '±1'])
|
||||||
|
@pytest.mark.parametrize('lang', ['en', 'xx'])
|
||||||
|
def test_issue2782(text, lang):
|
||||||
|
"""Check that like_num handles + and - before number."""
|
||||||
|
cls = get_lang_class(lang)
|
||||||
|
nlp = cls()
|
||||||
|
doc = nlp(text)
|
||||||
|
assert len(doc) == 1
|
||||||
|
assert doc[0].like_num
|
Loading…
Reference in New Issue
Block a user