From b95a11dd959bf614f359c15e087d0fc1e53584cf Mon Sep 17 00:00:00 2001
From: Duygu Altinok
Date: Wed, 7 Oct 2020 10:25:37 +0200
Subject: [PATCH] Ordinal numbers for Turkish (#6142)

* minor ordinal number addition

* fixed typo

* added corresponding lexical test
---
 spacy/lang/tr/lex_attrs.py       | 44 +++++++++++++++++++++++++++++++-
 spacy/tests/lang/tr/test_text.py | 32 +++++++++++++++++++++++
 2 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/lang/tr/test_text.py

diff --git a/spacy/lang/tr/lex_attrs.py b/spacy/lang/tr/lex_attrs.py
index 93f26fc8e..366bda9e7 100644
--- a/spacy/lang/tr/lex_attrs.py
+++ b/spacy/lang/tr/lex_attrs.py
@@ -35,6 +35,36 @@ _num_words = [
 ]
 
 
+_ordinal_words = [
+    "birinci",
+    "ikinci",
+    "üçüncü",
+    "dördüncü",
+    "beşinci",
+    "altıncı",
+    "yedinci",
+    "sekizinci",
+    "dokuzuncu",
+    "onuncu",
+    "yirminci",
+    "otuzuncu",
+    "kırkıncı",
+    "ellinci",
+    "altmışıncı",
+    "yetmişinci",
+    "sekseninci",
+    "doksanıncı",
+    "yüzüncü",
+    "bininci",
+    "milyonuncu",
+    "milyarıncı",
+    "trilyonuncu",
+    "katrilyonuncu",
+    "kentilyonuncu",
+]
+
+_ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")
+
 def like_num(text):
     if text.startswith(("+", "-", "±", "~")):
         text = text[1:]
@@ -45,8 +75,20 @@ def like_num(text):
         num, denom = text.split("/")
         if num.isdigit() and denom.isdigit():
             return True
-    if text.lower() in _num_words:
+
+    text_lower = text.lower()
+
+    #Check cardinal number
+    if text_lower in _num_words:
         return True
+
+    #Check ordinal number
+    if text_lower in _ordinal_words:
+        return True
+    if text_lower.endswith(_ordinal_endings):
+        if text_lower[:-3].isdigit() or text_lower[:-4].isdigit():
+            return True
+
     return False
 
 
diff --git a/spacy/tests/lang/tr/test_text.py b/spacy/tests/lang/tr/test_text.py
new file mode 100644
index 000000000..2fe638b5f
--- /dev/null
+++ b/spacy/tests/lang/tr/test_text.py
@@ -0,0 +1,32 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.lang.tr.lex_attrs import like_num
+
+
+@pytest.mark.parametrize(
+    "word",
+    [
+        "bir",
+        "iki",
+        "dört",
+        "altı",
+        "milyon",
+        "100",
+        "birinci",
+        "üçüncü",
+        "beşinci",
+        "100üncü",
+        "8inci"
+    ]
+)
+def test_tr_lex_attrs_like_number_cardinal_ordinal(word):
+    assert like_num(word)
+
+
+@pytest.mark.parametrize("word", ["beş", "yedi", "yedinci", "birinci"])
+def test_tr_lex_attrs_capitals(word):
+    assert like_num(word)
+    assert like_num(word.upper())
+