mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
b892b446cc
* Added the same punctuation rules as the Danish language. * Added abbreviations and also the possibility to have capitalized abbreviations on some. Added a few specific cases too * Added test for long texts in Swedish * Added morph rules, infixes and suffixes to __init__.py for Swedish * Added some tests for prefixes, infixes and suffixes * Added tests for lemma * Renamed files to follow convention * [sv] Removed ambiguous abbreviations * Added more tests for tokenizer exceptions * Added test for problem with punctuation in issue #2578 * Contributor agreement * Removed faulty lemmatization of 'jag' ('I') as it was lemmatized to 'jaga' ('hunt')
16 lines
675 B
Python
16 lines
675 B
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.mark.parametrize(
    'string,lemma',
    [
        ('DNA-profilernas', 'DNA-profil'),
        ('Elfenbenskustens', 'Elfenbenskusten'),
        ('abortmotståndarens', 'abortmotståndare'),
        ('kolesterols', 'kolesterol'),
        ('portionssnusernas', 'portionssnus'),
        ('åsyns', 'åsyn'),
    ]
)
def test_lemmatizer_lookup_assigns(sv_tokenizer, string, lemma):
    """Check the lemma assigned to the first token of each sample word.

    Every parametrized case pairs an input ``string`` with the ``lemma``
    expected on the first token produced by ``sv_tokenizer``.

    NOTE(review): ``sv_tokenizer`` is presumably a pytest fixture supplied
    by the surrounding test suite (it is not defined in this file) --
    confirm against the suite's conftest.
    """
    # Only the first token is inspected; any further tokens are ignored.
    first_token = sv_tokenizer(string)[0]
    assert first_token.lemma_ == lemma
|