Lemmatizer lookup dictionary for Serbian and basic tag set added. (#4251)

* Serbian stopwords added. (Cyrillic alphabet)

* spaCy Contribution agreement included.

* Test initialize updated

* Serbian language code update. --bugfix

* Tokenizer exceptions added. Init file updated.

* Norm exceptions and lexical attributes added.

* Examples added.

* Tests added.

* sr_lang examples update.

* Tokenizer exceptions updated. (Serbian)

* Lemmatizer created. Licence included.

* Test updated.

* Tag map basic added.

* tag_map.py file removed since it uses default spacy tags.
This commit is contained in:
Pavle Vidanović 2019-09-08 14:19:15 +02:00 committed by Ines Montani
parent b01025dd06
commit d03401f532
6 changed files with 253374 additions and 3 deletions

View File

@ -21,6 +21,7 @@ class SerbianDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Serbian(Language):

View File

@ -12,13 +12,14 @@ Example sentences to test spaCy and its language models.
# Example sentences for the Serbian language model.
# NOTE: the rendered diff interleaved old and new lines; the list below is the
# intended final state. The residue had two defects worth fixing:
#   1. the first "Apple ..." line lacked a trailing comma and was immediately
#      followed by its replacement, silently concatenating the two strings
#      into one doubled sentence;
#   2. the "Новак Ђоковић ..." line was concatenated with "У Пироту ..." via
#      adjacent string literals, merging two sentences into one list element.
sentences = [
    # Translations from English
    "Apple планира куповину америчког стартапа за $1 милијарду.",
    "Беспилотни аутомобили пребацују одговорност осигурања на произвођаче.",
    "Лондон је велики град у Уједињеном Краљевству.",
    "Где си ти?",
    "Ко је председник Француске?",
    # Serbian common and slang
    "Moj ћале је инжењер!",
    "Новак Ђоковић је најбољи тенисер света.",
    "У Пироту има добрих кафана!",
    "Музеј Николе Тесле се налази у Београду.",
]

253316
spacy/lang/sr/lemma_lookup.json Executable file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,32 @@
Copyright @InProceedings{ljubesic16-new,
author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
year = {2016},
date = {23-28},
location = {Portorož, Slovenia},
editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
publisher = {European Language Resources Association (ELRA)},
address = {Paris, France},
isbn = {978-2-9517408-9-1}
}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
The licence of Serbian lemmas was adopted from Serbian lexicon:
- sr.lexicon (https://github.com/clarinsi/reldi-tagger/blob/master/sr.lexicon)
Changelog:
- Lexicon is translated into Cyrillic
- Word order is sorted

View File

@ -15,6 +15,7 @@ _abbrev_exc = [
{ORTH: "пет", LEMMA: "петак", NORM: "петак"},
{ORTH: "суб", LEMMA: "субота", NORM: "субота"},
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
# Months abbreviations
{ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
{ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
@ -27,7 +28,7 @@ _abbrev_exc = [
{ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
{ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
{ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"},
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}
]

View File

@ -0,0 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


# (surface form, expected lemma) pairs covering adjectives, verbs in several
# tenses, and pronouns — all resolved via the Serbian lookup lemmatizer.
LOOKUP_LEMMA_CASES = [
    ("најадекватнији", "адекватан"),
    ("матурирао", "матурирати"),
    ("планираћемо", "планирати"),
    ("певају", "певати"),
    ("нама", "ми"),
    ("се", "себе"),
]


@pytest.mark.parametrize("string,lemma", LOOKUP_LEMMA_CASES)
def test_sr_lemmatizer_lookup_assigns(sr_tokenizer, string, lemma):
    """Tokenizing a single word should assign the expected lookup lemma."""
    doc = sr_tokenizer(string)
    assert doc[0].lemma_ == lemma