Lemmatizer ro (#2319)

* Add Romanian lemmatizer lookup table.

Adapted from http://www.lexiconista.com/datasets/lemmatization/
by replacing the legacy cedilla characters with the correct comma-below
forms (ș and ț); see the sketch after this list.

The original dataset is licensed under the Open Database License.

* Fix one blatant issue in the Romanian lemmatizer

* Romanian examples file

* Add ro_tokenizer in conftest

* Add Romanian lemmatizer test
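
The substitution mentioned in the first bullet is a plain character mapping over four code points. A minimal sketch (not part of the commit; the dict and helper names are made up):

CEDILLA_TO_COMMA = {
    "\u015E": "\u0218",  # Ş (S with cedilla) -> Ș (S with comma below)
    "\u015F": "\u0219",  # ş (s with cedilla) -> ș (s with comma below)
    "\u0162": "\u021A",  # Ţ (T with cedilla) -> Ț (T with comma below)
    "\u0163": "\u021B",  # ţ (t with cedilla) -> ț (t with comma below)
}

def fix_ro_diacritics(text):
    # Swap every legacy cedilla character for its comma-below counterpart.
    return "".join(CEDILLA_TO_COMMA.get(ch, ch) for ch in text)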
Author: Jani Monoses, 2018-05-12 16:20:04 +03:00 (committed by Ines Montani)
Parent: ae3719ece5
Commit: 0e08e49e87
6 changed files with 314,860 additions and 1 deletion

spacy/lang/ro/__init__.py (2 additions)

@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
+from .lemmatizer import LOOKUP
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -17,6 +18,7 @@ class RomanianDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
+    lemma_lookup = LOOKUP
 
 
 class Romanian(Language):
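
Assigning the table to lemma_lookup on the defaults is what makes lookup lemmas available on tokens even in a tokenizer-only pipeline. A quick check (not part of the diff; expected output taken from the test cases added at the bottom of this PR):

from spacy.lang.ro import Romanian

nlp = Romanian()            # tokenizer-only pipeline, no tagger needed
doc = nlp("câini erau")
print([(token.text, token.lemma_) for token in doc])
# per the tests below: [('câini', 'câine'), ('erau', 'fi')]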

spacy/lang/ro/examples.py (new file, 23 additions)

@@ -0,0 +1,23 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.ro import Romanian
>>> from spacy.lang.ro.examples import sentences
>>> nlp = Romanian()
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari",
    "Municipalitatea din San Francisco ia în calcul interzicerea roboților curieri pe trotuar",
    "Londra este un oraș mare în Regatul Unit",
    "Unde ești?",
    "Cine este președintele Franței?",
    "Care este capitala Statelor Unite?",
    "Când s-a născut Barack Obama?"
]

spacy/lang/ro/lemmatizer.py (new file, 314,816 additions)

File diff suppressed because it is too large.
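
The suppressed file is a single large lookup table: a flat dict mapping inflected forms to lemmas. A sketch of its expected shape (the name LOOKUP matches the import in __init__.py above; the sample entries are taken from the test cases in this PR):

# coding: utf8
from __future__ import unicode_literals

LOOKUP = {
    "câini": "câine",
    "expedițiilor": "expediție",
    "pensete": "pensetă",
    "erau": "fi",
    # ... roughly 314,000 further entries in the real file
}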

spacy/tests/conftest.py (6 additions, 1 deletion)

@@ -15,7 +15,7 @@ from .. import util
 # here if it's using spaCy's tokenizer (not a different library)
 # TODO: re-implement generic tokenizer tests
 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'ru', 'sv', 'tr', 'xx']
+              'it', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'tr', 'xx']
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_core_news_sm'],
@@ -100,6 +100,11 @@ def fi_tokenizer():
     return util.get_lang_class('fi').Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def ro_tokenizer():
+    return util.get_lang_class('ro').Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def id_tokenizer():
     return util.get_lang_class('id').Defaults.create_tokenizer()
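
Adding 'ro' to _languages also opts Romanian into the generic tokenizer tests mentioned in the comment above, which build one tokenizer per listed language code roughly along these lines (a sketch of the mechanism, not the literal conftest code):

@pytest.fixture(params=_languages)
def tokenizer(request):
    # One tokenizer instance per language code in _languages.
    return util.get_lang_class(request.param).Defaults.create_tokenizer()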

spacy/tests/lang/ro/__init__.py (new, empty file)

spacy/tests/lang/ro/test_lemmatizer.py (new file, 13 additions)

@@ -0,0 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize('string,lemma', [('câini', 'câine'),
                                          ('expedițiilor', 'expediție'),
                                          ('pensete', 'pensetă'),
                                          ('erau', 'fi')])
def test_lemmatizer_lookup_assigns(ro_tokenizer, string, lemma):
    tokens = ro_tokenizer(string)
    assert tokens[0].lemma_ == lemma
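
With the ro_tokenizer fixture registered in conftest.py, the new test runs like any other language test, e.g. (file path as labelled above):

pytest spacy/tests/lang/ro/test_lemmatizer.py -v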