Ordinal numbers for Turkish (#6142)

* minor ordinal number addition

* fixed typo

* added corresponding lexical test
This commit is contained in:
Duygu Altinok 2020-10-07 10:25:37 +02:00 committed by GitHub
parent 1a00bff06d
commit b95a11dd95
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 75 additions and 1 deletions

View File

@ -35,6 +35,36 @@ _num_words = [
] ]
# Turkish ordinal number words, compared against lower-cased token text.
_ordinal_words = [
    "birinci",
    "ikinci",
    "üçüncü",
    "dördüncü",
    "beşinci",
    "altıncı",
    "yedinci",
    "sekizinci",
    "dokuzuncu",
    "onuncu",
    "yirminci",
    "otuzuncu",
    "kırkıncı",
    "ellinci",
    "altmışıncı",
    "yetmişinci",
    "sekseninci",
    "doksanıncı",
    "yüzüncü",
    "bininci",
    # Was misspelled "mliyonuncu", so the real word never matched.
    "milyonuncu",
    "milyarıncı",
    "trilyonuncu",
    "katrilyonuncu",
    "kentilyonuncu",
]

# Ordinal suffixes that may follow a digit string (e.g. "8inci", "100üncü").
# Every entry is 3 or 4 characters long.
_ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")
def like_num(text): def like_num(text):
if text.startswith(("+", "-", "±", "~")): if text.startswith(("+", "-", "±", "~")):
text = text[1:] text = text[1:]
@ -45,8 +75,20 @@ def like_num(text):
num, denom = text.split("/") num, denom = text.split("/")
if num.isdigit() and denom.isdigit(): if num.isdigit() and denom.isdigit():
return True return True
if text.lower() in _num_words:
text_lower = text.lower()
#Check cardinal number
if text_lower in _num_words:
return True return True
#Check ordinal number
if text_lower in _ordinal_words:
return True
if text_lower.endswith(_ordinal_endings):
if text_lower[:-3].isdigit() or text_lower[:-4].isdigit():
return True
return False return False

View File

@ -0,0 +1,32 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.lang.tr.lex_attrs import like_num
@pytest.mark.parametrize(
    "word",
    [
        # Number words and a plain digit string.
        "bir",
        "iki",
        "dört",
        "altı",
        "milyon",
        "100",
        # Ordinal words.
        "birinci",
        "üçüncü",
        "beşinci",
        # Digits followed by an ordinal suffix.
        "100üncü",
        "8inci",
    ],
)
def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
    """Each parametrized Turkish token must be reported as number-like."""
    is_number_like = like_num(word)
    assert is_number_like
@pytest.mark.parametrize("word", ["beş", "yedi", "yedinci", "birinci"])
def test_tr_lex_attrs_capitals(word):
    """like_num must accept each word both as given and upper-cased."""
    for variant in (word, word.upper()):
        assert like_num(variant)