adds Croatian lemma_lookup.json, license file and corresponding tests (#4252)

This commit is contained in:
Ivan Šarić 2019-09-08 13:40:45 +02:00 committed by Matthew Honnibal
parent aec755d3a3
commit b01025dd06
5 changed files with 1313650 additions and 0 deletions

View File

@ -18,6 +18,7 @@ class CroatianDefaults(Language.Defaults):
) )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = STOP_WORDS stop_words = STOP_WORDS
resources = {"lemma_lookup": "lemma_lookup.json"}
class Croatian(Language): class Croatian(Language):

1313609
spacy/lang/hr/lemma_lookup.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,15 @@
The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
Reldi-tagger is licesned under the Apache 2.0 licence.
@InProceedings{ljubesic16-new,
author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
year = {2016},
date = {23-28},
location = {Portorož, Slovenia},
editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
publisher = {European Language Resources Association (ELRA)},
address = {Paris, France},
isbn = {978-2-9517408-9-1}
}

View File

@ -103,6 +103,11 @@ def he_tokenizer():
return get_lang_class("he").Defaults.create_tokenizer() return get_lang_class("he").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def hr_tokenizer():
return get_lang_class("hr").Defaults.create_tokenizer()
@pytest.fixture @pytest.fixture
def hu_tokenizer(): def hu_tokenizer():
return get_lang_class("hu").Defaults.create_tokenizer() return get_lang_class("hu").Defaults.create_tokenizer()

View File

@ -0,0 +1,20 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
"string,lemma",
[
("trčao", "trčati"),
("adekvatnim", "adekvatan"),
("dekontaminacijama", "dekontaminacija"),
("filologovih", "filologov"),
("je", "biti"),
("se", "sebe"),
],
)
def test_hr_lemmatizer_lookup_assigns(hr_tokenizer, string, lemma):
tokens = hr_tokenizer(string)
assert tokens[0].lemma_ == lemma