💫 Make like_num work for prefixed numbers (#2808)

* Only split off a leading `+` as a prefix when it is not followed by a digit

* Make like_num work for prefixed numbers

* Add test for like_num
This commit is contained in:
Ines Montani 2018-10-01 10:49:14 +02:00 committed by Matthew Honnibal
parent b39810d692
commit ea20b72c08
19 changed files with 61 additions and 12 deletions

View File

@ -76,6 +76,8 @@ def like_num(text):
"""
check if text resembles a number
"""
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -34,6 +34,8 @@ enoghalvfemsindstyvende tooghalvfemsindstyvende treoghalvfemsindstyvende fireogh
""".split()
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -21,6 +21,8 @@ _num_words = ['μηδέν', 'ένας', 'δυο', 'δυό', 'τρεις', 'τέ
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -13,6 +13,8 @@ _num_words = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -10,6 +10,8 @@ _num_words = ['nolla', 'yksi', 'kaksi', 'kolme', 'neljä', 'viisi', 'kuusi', 'se
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace('.', '').replace(',', '')
if text.isdigit():
return True

View File

@ -24,6 +24,8 @@ sextillionnième septillionnième octillionnième nonillionnième decillionnièm
def like_num(text):
# Might require more work?
# See this discussion: https://github.com/explosion/spaCy/pull/1161
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -40,6 +40,8 @@ def norm(string):
return string
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -11,6 +11,8 @@ _num_words = ['nol', 'satu', 'dua', 'tiga', 'empat', 'lima', 'enam', 'tujuh',
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -38,6 +38,8 @@ def is_ascii(text):
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
# can be overwritten by lang with list of number words
text = text.replace(',', '').replace('.', '')
if text.isdigit():

View File

@ -23,6 +23,8 @@ def like_num(text):
# or matches one of the number words. In order to handle numbers like
# "drieëntwintig", more work is required.
# See this discussion: https://github.com/explosion/spaCy/pull/1177
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -21,6 +21,8 @@ _ordinal_words = ['primeiro', 'segundo', 'terceiro', 'quarto', 'quinto', 'sexto'
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -6,8 +6,8 @@ from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from .char_classes import QUOTES, CURRENCY, UNITS
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
LIST_CURRENCY + LIST_ICONS)
_prefixes = (['§', '%', '=', r'\+(?![0-9])'] + LIST_PUNCT + LIST_ELLIPSES +
LIST_QUOTES + LIST_CURRENCY + LIST_ICONS)
_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +

View File

@ -23,6 +23,8 @@ miilea mielea mia milionulea milioana miliardulea miliardelea miliarda enșpemia
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -18,6 +18,8 @@ _num_words = [
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -13,6 +13,8 @@ _num_words = ['bir', 'iki', 'üç', 'dört', 'beş', 'altı', 'yedi', 'sekiz',
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True
@ -28,4 +30,3 @@ def like_num(text):
LEX_ATTRS = {
LIKE_NUM: like_num
}

View File

@ -12,6 +12,8 @@ _num_words = ['нуль', 'ноль', 'бер', 'ике', 'өч', 'дүрт', '
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -29,6 +29,8 @@ _ordinal_words = """پہلا دوسرا تیسرا چوتھا پانچواں چ
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -9,6 +9,8 @@ _num_words = ['không', 'một', 'hai', 'ba', 'bốn', 'năm', 'sáu', 'bẩy',
def like_num(text):
if text.startswith(('+', '-', '±', '~')):
text = text[1:]
text = text.replace(',', '').replace('.', '')
if text.isdigit():
return True

View File

@ -0,0 +1,16 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.util import get_lang_class
import pytest
@pytest.mark.parametrize('text', ['-0.23', '+123,456', '±1'])
@pytest.mark.parametrize('lang', ['en', 'xx'])
def test_issue2782(text, lang):
    """Regression test: a number with a leading sign is one like_num token.

    For every combination of language code and sign-prefixed numeric
    string, the pipeline built from that language's class must produce
    exactly one token, and that token must report ``like_num`` as truthy.
    """
    # Look up the language class and instantiate it in one step.
    nlp = get_lang_class(lang)()
    tokens = nlp(text)
    # The sign must not be split off into a token of its own.
    assert len(tokens) == 1
    assert tokens[0].like_num