Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)

Lemmatizer ro (#2319)

* Add a Romanian lemmatizer lookup table, adapted from http://www.lexiconista.com/datasets/lemmatization/ by replacing cedillas with commas (ș and ț); a sketch of that normalisation follows below. The original dataset is licensed under the Open Database License.
* Fix one obvious issue in the Romanian lemmatizer
* Add a Romanian examples file
* Add a ro_tokenizer fixture in conftest
* Add a Romanian lemmatizer test
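
The adaptation mentioned in the first bullet swaps the legacy s/t-with-cedilla code points used by the source dataset for the s/t-with-comma-below characters of modern Romanian orthography. A minimal sketch of that normalisation in Python, assuming plain string replacement (the mapping and helper below are illustrative and not part of this commit):

# Illustrative helper, not part of this commit: map the legacy cedilla
# code points to the comma-below characters used in the adapted table.
CEDILLA_TO_COMMA = {
    "\u015E": "\u0218",  # Ş -> Ș
    "\u015F": "\u0219",  # ş -> ș
    "\u0162": "\u021A",  # Ţ -> Ț
    "\u0163": "\u021B",  # ţ -> ț
}


def replace_cedillas(text):
    """Return text with s/t-cedilla replaced by s/t-comma-below."""
    for cedilla, comma in CEDILLA_TO_COMMA.items():
        text = text.replace(cedilla, comma)
    return text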

parent ae3719ece5
commit 0e08e49e87

spacy/lang/ro/__init__.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
+from .lemmatizer import LOOKUP
 
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -17,6 +18,7 @@ class RomanianDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
+    lemma_lookup = LOOKUP
 
 
 class Romanian(Language):
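
Setting lemma_lookup = LOOKUP wires the new table into the language defaults: with only the tokenizer in the pipeline, a token's lemma is resolved by a plain dictionary lookup from inflected form to lemma. A rough sketch of that behaviour, using the four entries exercised by the new test further down (lookup_lemma is an illustrative helper, not spaCy API):

# LOOKUP in spacy/lang/ro/lemmatizer.py is a large dict mapping inflected
# forms to lemmas; the four entries below are the ones the new test asserts.
LOOKUP = {
    "câini": "câine",
    "expedițiilor": "expediție",
    "pensete": "pensetă",
    "erau": "fi",
}


def lookup_lemma(word):
    # Fall back to the surface form when the word is not in the table.
    return LOOKUP.get(word, word)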
							
								
								
									
spacy/lang/ro/examples.py (new file, 23 lines)
@@ -0,0 +1,23 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.ro import Romanian
+>>> from spacy.lang.ro.examples import sentences
+>>> nlp = Romanian()
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari",
+    "Municipalitatea din San Francisco ia în calcul interzicerea roboților curieri pe trotuar",
+    "Londra este un oraș mare în Regatul Unit",
+    "Unde ești?",
+    "Cine este președintele Franței?",
+    "Care este capitala Statelor Unite?",
+    "Când s-a născut Barack Obama?"
+]
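
A short usage sketch combining the docstring above with the new lookup table (assumes a spaCy checkout that includes this commit; the printed lemmas depend on the lookup table's coverage):

# Run the blank Romanian pipeline over the example sentences and print
# the lemma assigned to each token.
from spacy.lang.ro import Romanian
from spacy.lang.ro.examples import sentences

nlp = Romanian()
for doc in nlp.pipe(sentences):
    print([(token.text, token.lemma_) for token in doc])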
							
								
								
									
spacy/lang/ro/lemmatizer.py (new file, 314816 lines)
File diff suppressed because it is too large.
spacy/tests/conftest.py
@@ -15,7 +15,7 @@ from .. import util
 # here if it's using spaCy's tokenizer (not a different library)
 # TODO: re-implement generic tokenizer tests
 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'ru', 'sv', 'tr', 'xx']
+              'it', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'tr', 'xx']
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_core_news_sm'],
@@ -100,6 +100,11 @@ def fi_tokenizer():
     return util.get_lang_class('fi').Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def ro_tokenizer():
+    return util.get_lang_class('ro').Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def id_tokenizer():
     return util.get_lang_class('id').Defaults.create_tokenizer()
spacy/tests/lang/ro/__init__.py (new file, empty)

spacy/tests/lang/ro/test_lemmatizer.py (new file, 13 lines)
@@ -0,0 +1,13 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('string,lemma', [('câini', 'câine'),
+                                          ('expedițiilor', 'expediție'),
+                                          ('pensete', 'pensetă'),
+                                          ('erau', 'fi')])
+def test_lemmatizer_lookup_assigns(ro_tokenizer, string, lemma):
+    tokens = ro_tokenizer(string)
+    assert tokens[0].lemma_ == lemma
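
The test relies only on the ro_tokenizer fixture added to conftest.py above. One way to run just this module from a source checkout (a hedged example, not part of the commit):

# Invoke pytest programmatically on the new test module; equivalent to
# running "pytest spacy/tests/lang/ro/test_lemmatizer.py -v" from the
# repository root.
import pytest
pytest.main(["spacy/tests/lang/ro/test_lemmatizer.py", "-v"])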