mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
adds Croatian lemma_lookup.json, license file and corresponding tests (#4252)
This commit is contained in:
parent
aec755d3a3
commit
b01025dd06
|
@ -18,6 +18,7 @@ class CroatianDefaults(Language.Defaults):
|
||||||
)
|
)
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
|
resources = {"lemma_lookup": "lemma_lookup.json"}
|
||||||
|
|
||||||
|
|
||||||
class Croatian(Language):
|
class Croatian(Language):
|
||||||
|
|
1313609
spacy/lang/hr/lemma_lookup.json
Normal file
1313609
spacy/lang/hr/lemma_lookup.json
Normal file
File diff suppressed because it is too large
Load Diff
15
spacy/lang/hr/lemma_lookup_license.txt
Normal file
15
spacy/lang/hr/lemma_lookup_license.txt
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
|
||||||
|
Reldi-tagger is licensed under the Apache 2.0 licence.
|
||||||
|
|
||||||
|
@InProceedings{ljubesic16-new,
|
||||||
|
author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
|
||||||
|
title = {New Inflectional Lexicons and Training Corpora for Improved Morphosyntactic Annotation of Croatian and Serbian},
|
||||||
|
booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
|
||||||
|
year = {2016},
|
||||||
|
date = {23-28},
|
||||||
|
location = {Portorož, Slovenia},
|
||||||
|
editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Sara Goggi and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Helene Mazo and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
|
||||||
|
publisher = {European Language Resources Association (ELRA)},
|
||||||
|
address = {Paris, France},
|
||||||
|
isbn = {978-2-9517408-9-1}
|
||||||
|
}
|
|
@ -103,6 +103,11 @@ def he_tokenizer():
|
||||||
return get_lang_class("he").Defaults.create_tokenizer()
|
return get_lang_class("he").Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def hr_tokenizer():
|
||||||
|
return get_lang_class("hr").Defaults.create_tokenizer()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def hu_tokenizer():
|
def hu_tokenizer():
|
||||||
return get_lang_class("hu").Defaults.create_tokenizer()
|
return get_lang_class("hu").Defaults.create_tokenizer()
|
||||||
|
|
20
spacy/tests/lang/hr/test_lemma.py
Normal file
20
spacy/tests/lang/hr/test_lemma.py
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"string,lemma",
|
||||||
|
[
|
||||||
|
("trčao", "trčati"),
|
||||||
|
("adekvatnim", "adekvatan"),
|
||||||
|
("dekontaminacijama", "dekontaminacija"),
|
||||||
|
("filologovih", "filologov"),
|
||||||
|
("je", "biti"),
|
||||||
|
("se", "sebe"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_hr_lemmatizer_lookup_assigns(hr_tokenizer, string, lemma):
|
||||||
|
tokens = hr_tokenizer(string)
|
||||||
|
assert tokens[0].lemma_ == lemma
|
Loading…
Reference in New Issue
Block a user