Add lex_attrs

2025-09-19 18:42:37 +03:00 · 2024-07-10 20:25:35 +03:00 · 2024-07-10 20:25:35 +03:00 · 602304c57d
commit 602304c57d
parent 90b77cfe5e
4 changed files with 158 additions and 0 deletions
--- a/spacy/lang/kmr/__init__.py
+++ b/spacy/lang/kmr/__init__.py
@ -1,8 +1,11 @@
+from .lex_attrs import LEX_ATTRS
 from ...language import BaseDefaults, Language
 from .stop_words import STOP_WORDS

+
 class KurmanjiDefaults(BaseDefaults):
+    """Language defaults for Kurmanji: stop words and lexical attribute getters."""
     stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS


 class Kurmanji(Language):
--- a/spacy/lang/kmr/lex_attrs.py
+++ b/spacy/lang/kmr/lex_attrs.py
@ -0,0 +1,139 @@
+from ...attrs import LIKE_NUM
+
+
+# Cardinal number words; like_num compares the lower-cased token against
+# this list.
+_num_words = [
+    # 0-10
+    "sifir",
+    "yek",
+    "du",
+    "sê",
+    "çar",
+    "pênc",
+    "şeş",
+    "heft",
+    "heşt",
+    "neh",
+    "deh",
+    # 11-19
+    "yazde",
+    "dazde",
+    "sêzde",
+    "çarde",
+    "pazde",
+    "şazde",
+    "hevde",
+    "hejde",
+    "nozde",
+    # tens: 20-90
+    "bîst",
+    "sî",
+    "çil",
+    "pêncî",
+    "şêst",
+    "heftê",
+    "heştê",
+    "nod",
+    # hundred, thousand, million, billion
+    "sed",
+    "hezar",
+    "milyon",
+    "milyar",
+]
+
+# Ordinal number words; like_num compares the lower-cased token against this
+# list. Every ordinal is listed with both the "-em" and the "-emîn" suffix.
+# Where the file spells a stem in two ways, both spellings are listed for
+# both suffixes so that recognition does not depend on which variant a text
+# happens to use.
+_ordinal_words = [
+    "yekem",
+    "yekemîn",
+    "duyem",
+    "duyemîn",
+    "sêyem",
+    "sêyemîn",
+    "çarem",
+    "çaremîn",
+    "pêncem",
+    "pêncemîn",
+    "şeşem",
+    "şeşemîn",
+    "heftem",
+    "heftemîn",
+    "heştem",
+    "heştemîn",
+    "nehem",
+    "nehemîn",
+    "dehem",
+    "dehemîn",
+    "yazdehem",
+    "yazdehemîn",
+    "dazdehem",
+    "dazdehemîn",
+    "sêzdehem",
+    "sêzdehemîn",
+    "çardehem",
+    "çardehemîn",
+    "pazdehem",
+    "pazdehemîn",
+    # Built on "şazde", the spelling used in _num_words.
+    "şazdehem",
+    "şazdehemîn",
+    "şanzdehem",
+    "şanzdehemîn",
+    "hevdehem",
+    "hevdehemîn",
+    "hejdehem",
+    "hejdehemîn",
+    "nozdehem",
+    "nozdehemîn",
+    "bîstem",
+    "bîstemîn",
+    "sîyem",
+    "sîyemîn",
+    "çilem",
+    "çilemîn",
+    # Both "pêncîy-" and "pênciy-" spellings, each with both suffixes.
+    "pêncîyem",
+    "pêncîyemîn",
+    "pênciyem",
+    "pênciyemîn",
+    "şêstem",
+    "şêstemîn",
+    "heftêyem",
+    "heftêyemîn",
+    "heştêyem",
+    "heştêyemîn",
+    # Built on "nod", the spelling used in _num_words.
+    "nodem",
+    "nodemîn",
+    "notem",
+    "notemîn",
+    "sedem",
+    "sedemîn",
+    "hezarem",
+    "hezaremîn",
+    "milyonem",
+    "milyonemîn",
+    "milyarem",
+    "milyaremîn",
+]
+
+
+def like_num(text):
+    """Return True if ``text`` looks like a number.
+
+    Accepted forms: plain digits (one leading "+", "-", "±" or "~" and any
+    "," / "." characters are ignored), a fraction with digits on both sides
+    of a single "/", a word listed in ``_num_words`` or ``_ordinal_words``
+    (compared in lower case), or digits followed by an ordinal suffix as
+    recognised by ``is_digit``.
+    """
+    # Ignore a single leading sign-like character.
+    candidate = text[1:] if text.startswith(("+", "-", "±", "~")) else text
+    # Drop "," and "." so that e.g. "1,000.5" reads as a run of digits.
+    candidate = candidate.replace(",", "").replace(".", "")
+    if candidate.isdigit():
+        return True
+
+    # A fraction needs exactly one "/": with more than one, the right-hand
+    # part still contains "/" and therefore fails isdigit().
+    numerator, slash, denominator = candidate.partition("/")
+    if slash and numerator.isdigit() and denominator.isdigit():
+        return True
+
+    lowered = candidate.lower()
+    return (
+        lowered in _num_words
+        or lowered in _ordinal_words
+        or is_digit(lowered)
+    )
+
+
+def is_digit(text):
+    """Return True if ``text`` is digits followed by an ordinal suffix.
+
+    The recognised suffixes are "em", "yem", "emîn" and "yemîn", so "100em"
+    and "30yemîn" match while a bare suffix or a bare number does not.
+    """
+    suffixes = ("em", "yem", "emîn", "yemîn")
+    return any(
+        text.endswith(suffix) and text[: -len(suffix)].isdigit()
+        for suffix in suffixes
+    )
+
+
+# Lexical attribute getters defined by this module: LIKE_NUM -> like_num.
+LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/tests/lang/kmr/__init__.py
+++ b/spacy/tests/lang/kmr/__init__.py
--- a/spacy/tests/lang/kmr/test_text.py
+++ b/spacy/tests/lang/kmr/test_text.py
@ -0,0 +1,16 @@
+import pytest
+
+from spacy.lang.kmr.lex_attrs import like_num
+
+
+@pytest.mark.parametrize(
+    "word",
+    [
+        "yekem",
+        "duyemîn",
+        "100em",
+        "dehem",
+        "sedemîn",
+        "34em",
+        "30yem",
+        "20emîn",
+        "50yemîn",
+    ],
+)
+def test_kmr_lex_attrs_like_number_for_ordinal(word):
+    """Ordinal words and digit-plus-suffix ordinals are number-like."""
+    assert like_num(word)
+
+
+@pytest.mark.parametrize("word", ["deh"])
+def test_kmr_lex_attrs_capitals(word):
+    """A number word is number-like both as given and in upper case."""
+    for variant in (word, word.upper()):
+        assert like_num(variant)