diff --git a/spacy/lang/kmr/__init__.py b/spacy/lang/kmr/__init__.py
new file mode 100644
index 000000000..defdf0429
--- /dev/null
+++ b/spacy/lang/kmr/__init__.py
@@ -0,0 +1,12 @@
+from ...language import BaseDefaults, Language
+from .stop_words import STOP_WORDS
+
+class KurmanjiDefaults(BaseDefaults):
+    stop_words = STOP_WORDS
+
+
+class Kurmanji(Language):
+    lang = "kmr"
+    Defaults = KurmanjiDefaults
+
+__all__ = ["Kurmanji"]
diff --git a/spacy/lang/kmr/examples.py b/spacy/lang/kmr/examples.py
new file mode 100644
index 000000000..5eb362001
--- /dev/null
+++ b/spacy/lang/kmr/examples.py
@@ -0,0 +1,17 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.kmr.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+sentences = [
+    "Berê mirovan her tim li geşedana pêşerojê ye", # People's gaze is always on the development of the future
+    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.", # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
+    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.", # Mem Ararat is a famous Kurdish artist
+    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.", # Firat Ceweri has been writing Kurdish books for 40 years
+    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand", # The young journalist wrote an interesting news article about the economic situation
+    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne", # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
+    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn", # Talented students succeeded in the mathematics competition
+    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.", # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
+]
diff --git a/spacy/lang/kmr/stop_words.py b/spacy/lang/kmr/stop_words.py
new file mode 100644
index 000000000..aee33c2b7
--- /dev/null
+++ b/spacy/lang/kmr/stop_words.py
@@ -0,0 +1,44 @@
+STOP_WORDS = set(
+    """
+û
+li
+bi
+di
+da
+de
+ji
+ku
+ew
+ez
+tu
+em
+hûn
+ew
+ev
+min
+te
+wî
+wê
+me
+we
+wan
+vê
+vî
+va
+çi
+kî
+kê
+çawa
+çima
+kengî
+li ku
+çend
+çiqas
+her
+hin
+gelek
+hemû
+kes
+tişt
+""".split()
+)
diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py
index 8a158647a..9b9ca4834 100644
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@@ -10,7 +10,7 @@ LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
              "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
              "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "xx", "yo", "kmr"]
 # fmt: on
 
 
diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py
index 78932f653..d8241a81c 100644
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@@ -57,6 +57,7 @@ LANGUAGES = [
     pytest.param("tr", marks=pytest.mark.slow()),
     pytest.param("tt", marks=pytest.mark.slow()),
     pytest.param("ur", marks=pytest.mark.slow()),
+    pytest.param("kmr", marks=pytest.mark.slow()),
 ]
 
 
diff --git a/website/meta/languages.json b/website/meta/languages.json
index d6a078097..a3717298f 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -480,6 +480,12 @@
       ],
       "example": "这是一个用于示例的句子。",
       "has_examples": true
+    },
+    {
+      "code": "kmr",
+      "name": "Kurdish Kurmanji",
+      "example": "Ev hevokek e",
+      "has_examples": true
     }
   ],
   "licenses": [