diff --git a/spacy/lang/kmr/__init__.py b/spacy/lang/kmr/__init__.py new file mode 100644 index 000000000..379696f23 --- /dev/null +++ b/spacy/lang/kmr/__init__.py @@ -0,0 +1,15 @@ +from .lex_attrs import LEX_ATTRS +from ...language import BaseDefaults, Language +from .stop_words import STOP_WORDS + + +class KurmanjiDefaults(BaseDefaults): + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS + + +class Kurmanji(Language): + lang = "kmr" + Defaults = KurmanjiDefaults + +__all__ = ["Kurmanji"] diff --git a/spacy/lang/kmr/examples.py b/spacy/lang/kmr/examples.py new file mode 100644 index 000000000..5eb362001 --- /dev/null +++ b/spacy/lang/kmr/examples.py @@ -0,0 +1,17 @@ +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.kmr.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +sentences = [ + "Berê mirovan her tim li geşedana pêşerojê ye", # People's gaze is always on the development of the future + "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.", # Kawa Nemir translated Ulysses into Kurmanji in 14 years. + "Mem Ararat hunermendekî Kurd yê bi nav û deng e.", # Mem Ararat is a famous Kurdish artist + "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.", # Firat Ceweri has been writing Kurdish books for 40 years + "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand", # The young journalist wrote an interesting news article about the economic situation + "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne", # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide + "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn", # Talented students succeeded in the mathematics competition + "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.", # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me +] diff --git a/spacy/lang/kmr/lex_attrs.py b/spacy/lang/kmr/lex_attrs.py new file mode 100644 index 000000000..8927ef141 --- /dev/null +++ b/spacy/lang/kmr/lex_attrs.py @@ -0,0 +1,139 @@ +from ...attrs import LIKE_NUM + + +_num_words = [ + "sifir", + "yek", + "du", + "sê", + "çar", + "pênc", + "şeş", + "heft", + "heşt", + "neh", + "deh", + "yazde", + "dazde", + "sêzde", + "çarde", + "pazde", + "şazde", + "hevde", + "hejde", + "nozde", + "bîst", + "sî", + "çil", + "pêncî", + "şêst", + "heftê", + "heştê", + "nod", + "sed", + "hezar", + "milyon", + "milyar", +] + +_ordinal_words = [ + "yekem", + "yekemîn", + "duyem", + "duyemîn", + "sêyem", + "sêyemîn", + "çarem", + "çaremîn", + "pêncem", + "pêncemîn", + "şeşem", + "şeşemîn", + "heftem", + "heftemîn", + "heştem", + "heştemîn", + "nehem", + "nehemîn", + "dehem", + "dehemîn", + "yazdehem", + "yazdehemîn", + "dazdehem", + "dazdehemîn", + "sêzdehem", + "sêzdehemîn", + "çardehem", + "çardehemîn", + "pazdehem", + "pazdehemîn", + "şanzdehem", + "şanzdehemîn", + "hevdehem", + "hevdehemîn", + "hejdehem", + "hejdehemîn", + "nozdehem", + "nozdehemîn", + "bîstem", + "bîstemîn", + "sîyem", + "sîyemîn", + "çilem", + "çilemîn", + "pêncîyem", + "pênciyemîn", + "şêstem", + "şêstemîn", + "heftêyem", + "heftêyemîn", + "heştêyem", + "heştêyemîn", + "notem", + "notemîn", + "sedem", + "sedemîn", + "hezarem", + "hezaremîn", + "milyonem", + "milyonemîn", + "milyarem", + "milyaremîn", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + text_lower = text.lower() + if text_lower in _num_words: + return True + + # Check ordinal number + if text_lower in _ordinal_words: + return True + + if is_digit(text_lower): + return True + + return False + + +def is_digit(text): + endings = ("em", "yem", "emîn", "yemîn") + for ending in endings: + to = len(ending) + if text.endswith(ending) and text[:-to].isdigit(): + return True + + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/kmr/stop_words.py b/spacy/lang/kmr/stop_words.py new file mode 100644 index 000000000..aee33c2b7 --- /dev/null +++ b/spacy/lang/kmr/stop_words.py @@ -0,0 +1,44 @@ +STOP_WORDS = set( + """ +û +li +bi +di +da +de +ji +ku +ew +ez +tu +em +hûn +ew +ev +min +te +wî +wê +me +we +wan +vê +vî +va +çi +kî +kê +çawa +çima +kengî +li ku +çend +çiqas +her +hin +gelek +hemû +kes +tişt +""".split() +) diff --git a/spacy/tests/lang/kmr/__init__.py b/spacy/tests/lang/kmr/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/kmr/test_text.py b/spacy/tests/lang/kmr/test_text.py new file mode 100644 index 000000000..209f386ec --- /dev/null +++ b/spacy/tests/lang/kmr/test_text.py @@ -0,0 +1,16 @@ +import pytest + +from spacy.lang.kmr.lex_attrs import like_num + + +@pytest.mark.parametrize( + "word", ["yekem", "duyemîn", "100em", "dehem", "sedemîn", "34em", "30yem", "20emîn", "50yemîn"] +) +def test_kmr_lex_attrs_like_number_for_ordinal(word): + assert like_num(word) + + +@pytest.mark.parametrize("word", ["deh"]) +def test_kmr_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 8a158647a..9b9ca4834 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -10,7 +10,7 @@ LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv", "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa", "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn", - "tr", "tt", "uk", "ur", "xx", "yo"] + "tr", "tt", "uk", "ur", "xx", "yo", "kmr"] # fmt: on diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index 78932f653..d8241a81c 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -57,6 +57,7 @@ LANGUAGES = [ pytest.param("tr", marks=pytest.mark.slow()), pytest.param("tt", marks=pytest.mark.slow()), pytest.param("ur", marks=pytest.mark.slow()), + pytest.param("kmr", marks=pytest.mark.slow()), ] diff --git a/website/meta/languages.json b/website/meta/languages.json index d6a078097..a3717298f 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -480,6 +480,12 @@ ], "example": "这是一个用于示例的句子。", "has_examples": true + }, + { + "code": "kmr", + "name": "Kurdish Kurmanji", + "example": "Ev hevokek e", + "has_examples": true } ], "licenses": [