Add Kurdish Kurmanji language (#13561)

* Add Kurdish Kurmanji language
* Add lex_attrs
2025-07-04 11:53:09 +03:00 · 2024-09-09 12:15:40 +03:00 · 2024-09-09 12:15:40 +03:00 · acbf2a428f
commit acbf2a428f
parent 55db9c2e87
9 changed files with 239 additions and 1 deletions
--- a/spacy/lang/kmr/__init__.py
+++ b/spacy/lang/kmr/__init__.py
@ -0,0 +1,15 @@
 from .lex_attrs import LEX_ATTRS
 from ...language import BaseDefaults, Language
 from .stop_words import STOP_WORDS
 class KurmanjiDefaults(BaseDefaults):
    """Language defaults for Kurmanji: lexical attribute getters and stop words."""

    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
 class Kurmanji(Language):
    """Kurmanji ``Language`` subclass, identified by the code ``"kmr"``."""

    Defaults = KurmanjiDefaults
    lang = "kmr"
 # Only the language class is exported from this package.
 __all__ = ["Kurmanji"]
--- a/spacy/lang/kmr/examples.py
+++ b/spacy/lang/kmr/examples.py
@ -0,0 +1,17 @@
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.kmr.examples import sentences
 >>> docs = nlp.pipe(sentences)
 """
 # Kurmanji example sentences; each trailing comment is its English translation.
 sentences = [
    "Berê mirovan her tim li geşedana pêşerojê ye",  # People's gaze is always on the development of the future.
    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.",  # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.",  # Mem Ararat is a famous Kurdish artist.
    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.",  # Firat Ceweri has been writing Kurdish books for 40 years.
    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand",  # The young journalist wrote an interesting news article about the economic situation.
    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne",  # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide.
    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn",  # Talented students succeeded in the mathematics competition.
    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.",  # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me.
 ]
--- a/spacy/lang/kmr/lex_attrs.py
+++ b/spacy/lang/kmr/lex_attrs.py
@ -0,0 +1,139 @@
 from ...attrs import LIKE_NUM
 _num_words = [
    "sifir",
    "yek",
    "du",
    "sê",
    "çar",
    "pênc",
    "şeş",
    "heft",
    "heşt",
    "neh",
    "deh",
    "yazde",
    "dazde",
    "sêzde",
    "çarde",
    "pazde",
    "şazde",
    "hevde",
    "hejde",
    "nozde",
    "bîst",
    "sî",
    "çil",
    "pêncî",
    "şêst",
    "heftê",
    "heştê",
    "nod",
    "sed",
    "hezar",
    "milyon",
    "milyar",
 ]
 _ordinal_words = [
    "yekem",
    "yekemîn",
    "duyem",
    "duyemîn",
    "sêyem",
    "sêyemîn",
    "çarem",
    "çaremîn",
    "pêncem",
    "pêncemîn",
    "şeşem",
    "şeşemîn",
    "heftem",
    "heftemîn",
    "heştem",
    "heştemîn",
    "nehem",
    "nehemîn",
    "dehem",
    "dehemîn",
    "yazdehem",
    "yazdehemîn",
    "dazdehem",
    "dazdehemîn",
    "sêzdehem",
    "sêzdehemîn",
    "çardehem",
    "çardehemîn",
    "pazdehem",
    "pazdehemîn",
    "şanzdehem",
    "şanzdehemîn",
    "hevdehem",
    "hevdehemîn",
    "hejdehem",
    "hejdehemîn",
    "nozdehem",
    "nozdehemîn",
    "bîstem",
    "bîstemîn",
    "sîyem",
    "sîyemîn",
    "çilem",
    "çilemîn",
    "pêncîyem",
    "pênciyemîn",
    "şêstem",
    "şêstemîn",
    "heftêyem",
    "heftêyemîn",
    "heştêyem",
    "heştêyemîn",
    "notem",
    "notemîn",
    "sedem",
    "sedemîn",
    "hezarem",
    "hezaremîn",
    "milyonem",
    "milyonemîn",
    "milyarem",
    "milyaremîn",
 ]
 def like_num(text):
    """Return True if *text* looks like a number.

    Accepted forms, after dropping one leading "+", "-", "±" or "~" and all
    "," and "." characters:

    * a string of digits;
    * a fraction ``digits/digits`` with exactly one slash;
    * (case-insensitively) an entry of ``_num_words`` or ``_ordinal_words``;
    * (case-insensitively) digits followed by an ordinal suffix, as decided
      by ``is_digit``.
    """
    unsigned = text[1:] if text.startswith(("+", "-", "±", "~")) else text
    candidate = unsigned.replace(",", "").replace(".", "")
    if candidate.isdigit():
        return True
    # With no slash the denominator is empty, and with more than one it still
    # contains a "/", so only a single-slash fraction passes both isdigit checks.
    numerator, _, denominator = candidate.partition("/")
    if numerator.isdigit() and denominator.isdigit():
        return True
    lowered = candidate.lower()
    return (
        lowered in _num_words
        or lowered in _ordinal_words
        or is_digit(lowered)
    )
 def is_digit(text):
    """Return True if *text* is a run of digits followed by an ordinal suffix.

    The recognised suffixes are "em", "yem", "emîn" and "yemîn"; the part
    before the suffix must be non-empty and consist of digits only.
    """
    for suffix in ("em", "yem", "emîn", "yemîn"):
        if not text.endswith(suffix):
            continue
        stem = text[: -len(suffix)]
        if stem.isdigit():
            return True
    return False
 # Lexical attribute getters for this language: maps the LIKE_NUM attribute to
 # the like_num function defined in this module.
 LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/kmr/stop_words.py
+++ b/spacy/lang/kmr/stop_words.py
@ -0,0 +1,44 @@
 # Kurmanji stop words, one per line.
 # NOTE: the text below is split on whitespace, so only single-word entries can
 # be represented. The phrase "li ku" was previously listed but was split into
 # "li" and "ku", both of which are already present; it and a duplicate "ew"
 # have been dropped. The resulting set is unchanged.
 STOP_WORDS = set(
    """
 û
 li
 bi
 di
 da
 de
 ji
 ku
 ew
 ez
 tu
 em
 hûn
 ev
 min
 te
 wî
 wê
 me
 we
 wan
 vê
 vî
 va
 çi
 kî
 kê
 çawa
 çima
 kengî
 çend
 çiqas
 her
 hin
 gelek
 hemû
 kes
 tişt
 """.split()
 )
--- a/spacy/tests/lang/kmr/__init__.py
+++ b/spacy/tests/lang/kmr/__init__.py
--- a/spacy/tests/lang/kmr/test_text.py
+++ b/spacy/tests/lang/kmr/test_text.py
@ -0,0 +1,16 @@
 import pytest
 from spacy.lang.kmr.lex_attrs import like_num
@pytest.mark.parametrize(
    "word",
    [
        "yekem",
        "duyemîn",
        "100em",
        "dehem",
        "sedemîn",
        "34em",
        "30yem",
        "20emîn",
        "50yemîn",
    ],
)
def test_kmr_lex_attrs_like_number_for_ordinal(word):
    """Spelled-out ordinals and digit-plus-suffix ordinals must be number-like."""
    is_number_like = like_num(word)
    assert is_number_like
@pytest.mark.parametrize("word", ["deh"])
def test_kmr_lex_attrs_capitals(word):
    """A number word must be recognised both as given and upper-cased."""
    for variant in (word, word.upper()):
        assert like_num(variant)
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@ -10,7 +10,7 @@ LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
             "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "xx", "yo", "kmr"]
 # fmt: on
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@ -57,6 +57,7 @@ LANGUAGES = [
    pytest.param("tr", marks=pytest.mark.slow()),
    pytest.param("tt", marks=pytest.mark.slow()),
    pytest.param("ur", marks=pytest.mark.slow()),
    pytest.param("kmr", marks=pytest.mark.slow()),
 ]
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@ -480,6 +480,12 @@
            ],
            "example": "这是一个用于示例的句子。",
            "has_examples": true
        },
        {
            "code": "kmr",
            "name": "Kurdish Kurmanji",
            "example": "Ev hevokek e",
            "has_examples": true
        }
    ],
    "licenses": [