Lemmatizer lookup dictionary for Serbian and basic tag set added. (#4251)

* Serbian stopwords added. (Cyrillic alphabet)

* spaCy Contribution agreement included.

* Test initialize updated

* Serbian language code update. --bugfix

* Tokenizer exceptions added. Init file updated.

* Norm exceptions and lexical attributes added.

* Examples added.

* Tests added.

* sr_lang examples update.

* Tokenizer exceptions updated. (Serbian)

* Lemmatizer created. Licence included.

* Test updated.

* Tag map basic added.

* tag_map.py file removed since it uses default spacy tags.
This commit is contained in:
Pavle Vidanović 2019-09-08 14:19:15 +02:00 committed by Ines Montani
parent b01025dd06
commit d03401f532
6 changed files with 253374 additions and 3 deletions

View File

@ -21,6 +21,7 @@ class SerbianDefaults(Language.Defaults):
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Serbian(Language):

View File

@ -12,13 +12,14 @@ Example sentences to test spaCy and its language models.
# Example sentences for the Serbian language model.
# NOTE: the rendered diff interleaved old and new lines; the list below is the
# intended final state. The residue had two defects worth fixing:
#   1. the first "Apple ..." line lacked a trailing comma and was immediately
#      followed by its replacement, silently concatenating the two strings
#      into one doubled sentence;
#   2. the "Новак Ђоковић ..." line was concatenated with "У Пироту ..." via
#      adjacent string literals, merging two sentences into one list element.
sentences = [
    # Translations from English
    "Apple планира куповину америчког стартапа за $1 милијарду.",
    "Беспилотни аутомобили пребацују одговорност осигурања на произвођаче.",
    "Лондон је велики град у Уједињеном Краљевству.",
    "Где си ти?",
    "Ко је председник Француске?",
    # Serbian common and slang
    "Moj ћале је инжењер!",
    "Новак Ђоковић је најбољи тенисер света.",
    "У Пироту има добрих кафана!",
    "Музеј Николе Тесле се налази у Београду.",
]

253316
spacy/lang/sr/lemma_lookup.json Executable file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,32 @@
Copyright @InProceedings{ljubesic16-new,
author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
year = {2016},
date = {23-28},
location = {Portorož, Slovenia},
editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
publisher = {European Language Resources Association (ELRA)},
address = {Paris, France},
isbn = {978-2-9517408-9-1}
}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
The licence of Serbian lemmas was adopted from Serbian lexicon:
- sr.lexicon (https://github.com/clarinsi/reldi-tagger/blob/master/sr.lexicon)
Changelog:
- Lexicon is translated into Cyrillic
- Word order is sorted

View File

@ -15,6 +15,7 @@ _abbrev_exc = [
{ORTH: "пет", LEMMA: "петак", NORM: "петак"},
{ORTH: "суб", LEMMA: "субота", NORM: "субота"},
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
# Months abbreviations
{ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
{ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
@ -27,7 +28,7 @@ _abbrev_exc = [
{ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
{ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
{ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"},
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}
]

View File

@ -0,0 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


# (surface form, expected lemma) pairs covering adjectives, verbs in several
# tenses, and pronouns — all resolved via the Serbian lookup lemmatizer.
LOOKUP_LEMMA_CASES = [
    ("најадекватнији", "адекватан"),
    ("матурирао", "матурирати"),
    ("планираћемо", "планирати"),
    ("певају", "певати"),
    ("нама", "ми"),
    ("се", "себе"),
]


@pytest.mark.parametrize("string,lemma", LOOKUP_LEMMA_CASES)
def test_sr_lemmatizer_lookup_assigns(sr_tokenizer, string, lemma):
    """Tokenizing a single word should assign the expected lookup lemma."""
    doc = sr_tokenizer(string)
    assert doc[0].lemma_ == lemma