English: adds ordinal numbers (#5830)

This commit is contained in:
Rahul Gupta 2020-07-29 23:52:47 +05:30 committed by GitHub
parent 90b958fd01
commit f76fae0e8d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 62 additions and 1 deletions

View File

@ -44,6 +44,44 @@ _num_words = [
]
_ordinal_words = [
"first",
"second",
"third",
"fourth",
"fifth",
"sixth",
"seventh",
"eighth",
"ninth",
"tenth",
"eleventh",
"twelfth",
"thirteenth",
"fourteenth",
"fifteenth",
"sixteenth",
"seventeenth",
"eighteenth",
"nineteenth",
"twentieth",
"thirtieth",
"fortieth",
"fiftieth",
"sixtieth",
"seventieth",
"eightieth",
"ninetieth",
"hundredth",
"thousandth",
"millionth",
"billionth",
"trillionth",
"quadrillionth",
"gajillionth",
"bazillionth",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
@ -54,8 +92,18 @@ def like_num(text):
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
if text.lower() in _num_words:
text_lower = text.lower()
if text_lower in _num_words:
return True
# Check for ordinal numbers (e.g. "third", "100th")
if text_lower in _ordinal_words:
return True
if text_lower.endswith("th"):
if text_lower[:-2].isdigit():
return True
return False

View File

@ -61,6 +61,19 @@ def test_lex_attrs_like_number(en_tokenizer, text, match):
assert tokens[0].like_num == match
@pytest.mark.parametrize("word", ["third", "Millionth", "100th", "Hundredth"])
def test_en_lex_attrs_like_number_for_ordinal(word):
    """Ordinals count as number-like: spelled out (any casing) or digits + "th"."""
    assert like_num(word)
@pytest.mark.parametrize("word", ["eleven"])
def test_en_lex_attrs_capitals(word):
assert like_num(word)