Add lex_attrs

2025-09-20 02:52:45 +03:00 · 2024-07-10 20:25:35 +03:00 · 2024-07-10 20:25:35 +03:00 · 602304c57d
commit 602304c57d
parent 90b77cfe5e
4 changed files with 158 additions and 0 deletions
--- a/spacy/lang/kmr/__init__.py
+++ b/spacy/lang/kmr/__init__.py
@ -1,8 +1,11 @@
 from .lex_attrs import LEX_ATTRS
 from ...language import BaseDefaults, Language
 from .stop_words import STOP_WORDS
 class KurmanjiDefaults(BaseDefaults):
    """Kurmanji-specific overrides layered on top of ``BaseDefaults``."""

    # Lexical attribute getters imported from the sibling ``lex_attrs`` module.
    lex_attr_getters = LEX_ATTRS
    # Stop-word collection imported from the sibling ``stop_words`` module.
    stop_words = STOP_WORDS
 class Kurmanji(Language):
--- a/spacy/lang/kmr/lex_attrs.py
+++ b/spacy/lang/kmr/lex_attrs.py
@ -0,0 +1,139 @@
 from ...attrs import LIKE_NUM
 _num_words = [
    "sifir",
    "yek",
    "du",
    "sê",
    "çar",
    "pênc",
    "şeş",
    "heft",
    "heşt",
    "neh",
    "deh",
    "yazde",
    "dazde",
    "sêzde",
    "çarde",
    "pazde",
    "şazde",
    "hevde",
    "hejde",
    "nozde",
    "bîst",
    "sî",
    "çil",
    "pêncî",
    "şêst",
    "heftê",
    "heştê",
    "nod",
    "sed",
    "hezar",
    "milyon",
    "milyar",
 ]
 _ordinal_words = [
    "yekem",
    "yekemîn",
    "duyem",
    "duyemîn",
    "sêyem",
    "sêyemîn",
    "çarem",
    "çaremîn",
    "pêncem",
    "pêncemîn",
    "şeşem",
    "şeşemîn",
    "heftem",
    "heftemîn",
    "heştem",
    "heştemîn",
    "nehem",
    "nehemîn",
    "dehem",
    "dehemîn",
    "yazdehem",
    "yazdehemîn",
    "dazdehem",
    "dazdehemîn",
    "sêzdehem",
    "sêzdehemîn",
    "çardehem",
    "çardehemîn",
    "pazdehem",
    "pazdehemîn",
    "şanzdehem",
    "şanzdehemîn",
    "hevdehem",
    "hevdehemîn",
    "hejdehem",
    "hejdehemîn",
    "nozdehem",
    "nozdehemîn",
    "bîstem",
    "bîstemîn",
    "sîyem",
    "sîyemîn",
    "çilem",
    "çilemîn",
    "pêncîyem",
    "pênciyemîn",
    "şêstem",
    "şêstemîn",
    "heftêyem",
    "heftêyemîn",
    "heştêyem",
    "heştêyemîn",
    "notem",
    "notemîn",
    "sedem",
    "sedemîn",
    "hezarem",
    "hezaremîn",
    "milyonem",
    "milyonemîn",
    "milyarem",
    "milyaremîn",
 ]
 def like_num(text):
    """Return True when ``text`` looks like a number.

    A single leading "+", "-", "±" or "~" is dropped and every "," and "."
    is removed before checking. The remainder counts as number-like when it
    is all digits, a digit/digit fraction with exactly one "/", a known
    cardinal or ordinal word (compared in lowercase), or digits followed by
    an ordinal suffix as decided by ``is_digit``.
    """
    # Strip at most one leading sign / approximation marker.
    candidate = text[1:] if text.startswith(("+", "-", "±", "~")) else text
    # Thousands and decimal separators do not affect "number-likeness".
    candidate = candidate.replace(",", "").replace(".", "")

    if candidate.isdigit():
        return True

    # Exactly one "/" splits into exactly two parts: numerator and denominator.
    pieces = candidate.split("/")
    if len(pieces) == 2 and pieces[0].isdigit() and pieces[1].isdigit():
        return True

    lowered = candidate.lower()
    return (
        lowered in _num_words
        or lowered in _ordinal_words
        or is_digit(lowered)
    )
 def is_digit(text):
    """Return True when ``text`` is digits followed by an ordinal suffix.

    Recognised suffixes are "em", "yem", "emîn" and "yemîn"; the part before
    the suffix must be non-empty and consist only of digits. The comparison
    is case-sensitive, so callers are expected to pass lowercase text.
    """
    return any(
        text.endswith(suffix) and text[: -len(suffix)].isdigit()
        for suffix in ("em", "yem", "emîn", "yemîn")
    )
 # Mapping from lexical attribute ID to its getter function; only LIKE_NUM is
 # customised in this module.
 LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/tests/lang/kmr/__init__.py
+++ b/spacy/tests/lang/kmr/__init__.py
--- a/spacy/tests/lang/kmr/test_text.py
+++ b/spacy/tests/lang/kmr/test_text.py
@ -0,0 +1,16 @@
 import pytest
 from spacy.lang.kmr.lex_attrs import like_num
@pytest.mark.parametrize(
    "word",
    [
        "yekem",
        "duyemîn",
        "100em",
        "dehem",
        "sedemîn",
        "34em",
        "30yem",
        "20emîn",
        "50yemîn",
    ],
)
def test_kmr_lex_attrs_like_number_for_ordinal(word):
    """Ordinal words and digit-plus-suffix ordinals must be number-like."""
    assert like_num(word)
@pytest.mark.parametrize("word", ["deh"])
def test_kmr_lex_attrs_capitals(word):
    """A number word must be recognised both as given and fully upper-cased."""
    for variant in (word, word.upper()):
        assert like_num(variant)