adds Croatian lemma_lookup.json, license file and corresponding tests (#4252)

2025-12-16 22:54:18 +03:00 · 2019-09-08 13:40:45 +02:00 · 2019-09-08 13:40:45 +02:00 · b01025dd06
commit b01025dd06
parent aec755d3a3
5 changed files with 1313650 additions and 0 deletions
--- a/spacy/lang/hr/__init__.py
+++ b/spacy/lang/hr/__init__.py
@@ -18,6 +18,7 @@ class CroatianDefaults(Language.Defaults):
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
    stop_words = STOP_WORDS
+    resources = {"lemma_lookup": "lemma_lookup.json"}


 class Croatian(Language):
--- a/spacy/lang/hr/lemma_lookup.json
+++ b/spacy/lang/hr/lemma_lookup.json
--- a/spacy/lang/hr/lemma_lookup_license.txt
+++ b/spacy/lang/hr/lemma_lookup_license.txt
@@ -0,0 +1,15 @@
+The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
+Reldi-tagger is licensed under the Apache 2.0 licence.
+
+@InProceedings{ljubesic16-new,
+  author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
+  title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
+  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
+  year = {2016},
+  date = {23-28},
+  location = {Portorož, Slovenia},
+  editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
+  publisher = {European Language Resources Association (ELRA)},
+  address = {Paris, France},
+  isbn = {978-2-9517408-9-1}
+ }
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -103,6 +103,11 @@ def he_tokenizer():
    return get_lang_class("he").Defaults.create_tokenizer()


+@pytest.fixture(scope="session")
+def hr_tokenizer():
+    return get_lang_class("hr").Defaults.create_tokenizer()
+
+
@pytest.fixture
 def hu_tokenizer():
    return get_lang_class("hu").Defaults.create_tokenizer()
--- a/spacy/tests/lang/hr/test_lemma.py
+++ b/spacy/tests/lang/hr/test_lemma.py
@@ -0,0 +1,20 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "string,lemma",
+    [
+        ("trčao", "trčati"),
+        ("adekvatnim", "adekvatan"),
+        ("dekontaminacijama", "dekontaminacija"),
+        ("filologovih", "filologov"),
+        ("je", "biti"),
+        ("se", "sebe"),
+    ],
+)
+def test_hr_lemmatizer_lookup_assigns(hr_tokenizer, string, lemma):
+    tokens = hr_tokenizer(string)
+    assert tokens[0].lemma_ == lemma