mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-12 04:38:28 +03:00
0e08e49e87
* Add Romanian lemmatizer lookup table. Adapted from http://www.lexiconista.com/datasets/lemmatization/ by replacing cedillas with commas (ș and ț). The original dataset is licensed under the Open Database License. * Fix one blatant issue in the Romanian lemmatizer * Romanian examples file * Add ro_tokenizer in conftest * Add Romanian lemmatizer test
14 lines
474 B
Python
14 lines
474 B
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.mark.parametrize(
    'string,lemma',
    [
        ('câini', 'câine'),
        ('expedițiilor', 'expediție'),
        ('pensete', 'pensetă'),
        ('erau', 'fi'),
    ],
)
def test_lemmatizer_lookup_assigns(ro_tokenizer, string, lemma):
    """Check that the first token produced for `string` has `lemma` as its lemma_.

    `ro_tokenizer` is a fixture supplied outside this file; it is called
    with the raw string and the result is indexed like a sequence.
    """
    doc = ro_tokenizer(string)
    # Each parametrized input is a single word, so only the first token
    # of the result is inspected.
    first_token = doc[0]
    assert first_token.lemma_ == lemma
|