From 81314f8659c215e01b6068d134f4ccaa647ba318 Mon Sep 17 00:00:00 2001
From: Vadim Mazaev
Date: Tue, 21 Nov 2017 22:23:59 +0300
Subject: [PATCH] Fixed tokenizer: added char classes; added first lemmatizer and tokenizer tests

---
 spacy/lang/char_classes.py             |  11 ++-
 spacy/tests/conftest.py                |  12 ++-
 spacy/tests/lang/ru/__init__.py        |   0
 spacy/tests/lang/ru/test_lemmatizer.py |  71 ++++++++++++++
 spacy/tests/lang/ru/test_punct.py      | 131 +++++++++++++++++++++++++
 5 files changed, 220 insertions(+), 5 deletions(-)
 create mode 100644 spacy/tests/lang/ru/__init__.py
 create mode 100644 spacy/tests/lang/ru/test_lemmatizer.py
 create mode 100644 spacy/tests/lang/ru/test_punct.py

diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 7ec631c92..68d8eecc7 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -15,9 +15,11 @@ _hebrew = r'[\p{L}&&\p{Hebrew}]'
 _latin_lower = r'[\p{Ll}&&\p{Latin}]'
 _latin_upper = r'[\p{Lu}&&\p{Latin}]'
 _latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
+_russian_lower = r'[ёа-я]'
+_russian_upper = r'[ЁА-Я]'
 
-_upper = [_latin_upper]
-_lower = [_latin_lower]
+_upper = [_latin_upper, _russian_upper]
+_lower = [_latin_lower, _russian_lower]
 _uncased = [_bengali, _hebrew]
 
 ALPHA = merge_char_classes(_upper + _lower + _uncased)
@@ -27,8 +29,9 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)
 
 _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
-          'TB T G M K %')
-_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
+          'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
+          'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')
+_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽'
 
 # These expressions contain various unicode variations, including characters
 # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 2d1b03514..73a484c13 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -15,7 +15,7 @@ from .. import util
 # here if it's using spaCy's tokenizer (not a different library)
 # TODO: re-implement generic tokenizer tests
 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
+              'it', 'nb', 'nl', 'pl', 'pt', 'ru', 'sv', 'xx']
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_core_news_sm'],
@@ -40,6 +40,11 @@ def FR(request):
     return load_test_model(request.param)
 
 
+@pytest.fixture()
+def RU(request):
+    return util.get_lang_class('ru')
+
+
 #@pytest.fixture(params=_languages)
 #def tokenizer(request):
 #lang = util.get_lang_class(request.param)
@@ -137,6 +142,11 @@ def th_tokenizer():
     return util.get_lang_class('th').Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def ru_tokenizer():
+    return util.get_lang_class('ru').Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()
diff --git a/spacy/tests/lang/ru/__init__.py b/spacy/tests/lang/ru/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py
new file mode 100644
index 000000000..345bcd971
--- /dev/null
+++ b/spacy/tests/lang/ru/test_lemmatizer.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from ....tokens.doc import Doc
+
+
+@pytest.fixture
+def ru_lemmatizer(RU):
+    return RU.Defaults.create_lemmatizer()
+
+
+# @pytest.mark.models('ru')
+# def test_doc_lemmatization(RU):
+#     doc = Doc(RU.vocab, words=['мама', 'мыла', 'раму'])
+#     doc[0].tag_ = 'NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing'
+#     doc[1].tag_ = 'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act'
+#     doc[2].tag_ = 'NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing'
+#
+#     lemmas = [token.lemma_ for token in doc]
+#     assert lemmas == ['мама', 'мыть', 'рама']
+
+
+@pytest.mark.models('ru')
+@pytest.mark.parametrize('text,lemmas', [('гвоздики', ['гвоздик', 'гвоздика']),
+                                         ('люди', ['человек']),
+                                         ('реки', ['река']),
+                                         ('кольцо', ['кольцо']),
+                                         ('пепперони', ['пепперони'])])
+def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
+    assert sorted(ru_lemmatizer.noun(text)) == lemmas
+
+
+@pytest.mark.models('ru')
+@pytest.mark.parametrize('text,pos,morphology,lemma', [('рой', 'NOUN', None, 'рой'),
+                                                       ('рой', 'VERB', None, 'рыть'),
+                                                       ('клей', 'NOUN', None, 'клей'),
+                                                       ('клей', 'VERB', None, 'клеить'),
+                                                       ('три', 'NUM', None, 'три'),
+                                                       ('кос', 'NOUN', {'Number': 'Sing'}, 'кос'),
+                                                       ('кос', 'NOUN', {'Number': 'Plur'}, 'коса'),
+                                                       ('кос', 'ADJ', None, 'косой'),
+                                                       ('потом', 'NOUN', None, 'пот'),
+                                                       ('потом', 'ADV', None, 'потом')
+                                                       ])
+def test_ru_lemmatizer_works_with_different_pos_homonyms(ru_lemmatizer, text, pos, morphology, lemma):
+    assert ru_lemmatizer(text, pos, morphology) == [lemma]
+
+
+@pytest.mark.models('ru')
+@pytest.mark.parametrize('text,morphology,lemma', [('гвоздики', {'Gender': 'Fem'}, 'гвоздика'),
+                                                   ('гвоздики', {'Gender': 'Masc'}, 'гвоздик'),
+                                                   ('вина', {'Gender': 'Fem'}, 'вина'),
+                                                   ('вина', {'Gender': 'Neut'}, 'вино')
+                                                   ])
+def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
+    assert ru_lemmatizer.noun(text, morphology) == [lemma]
+
+
+# @pytest.mark.models('ru')
+# def test_ru_lemmatizer_punct(ru_lemmatizer):
+#     assert ru_lemmatizer.punct('“') == ['"']
+#     assert ru_lemmatizer.punct('”') == ['"']
+#
+#
+# @pytest.mark.models('ru')
+# def test_ru_lemmatizer_lemma_assignment(RU):
+#     text = "А роза упала на лапу Азора."
+#     doc = RU.make_doc(text)
+#     RU.tagger(doc)
+#     assert all(t.lemma_ != '' for t in doc)
diff --git a/spacy/tests/lang/ru/test_punct.py b/spacy/tests/lang/ru/test_punct.py
new file mode 100644
index 000000000..6a689ff71
--- /dev/null
+++ b/spacy/tests/lang/ru/test_punct.py
@@ -0,0 +1,131 @@
+# coding: utf-8
+"""Test that open, closed and paired punctuation is split off correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+from ....util import compile_prefix_regex
+from ....lang.punctuation import TOKENIZER_PREFIXES
+
+
+PUNCT_OPEN = ['(', '[', '{', '*']
+PUNCT_CLOSE = [')', ']', '}', '*']
+PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
+
+
+@pytest.mark.parametrize('text', ["(", "((", "<"])
+def test_ru_tokenizer_handles_only_punct(ru_tokenizer, text):
+    tokens = ru_tokenizer(text)
+    assert len(tokens) == len(text)
+
+
+@pytest.mark.parametrize('punct', PUNCT_OPEN)
+@pytest.mark.parametrize('text', ["Привет"])
+def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text):
+    tokens = ru_tokenizer(punct + text)
+    assert len(tokens) == 2
+    assert tokens[0].text == punct
+    assert tokens[1].text == text
+
+
+@pytest.mark.parametrize('punct', PUNCT_CLOSE)
+@pytest.mark.parametrize('text', ["Привет"])
+def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text):
+    tokens = ru_tokenizer(text + punct)
+    assert len(tokens) == 2
+    assert tokens[0].text == text
+    assert tokens[1].text == punct
+
+
+@pytest.mark.parametrize('punct', PUNCT_OPEN)
+@pytest.mark.parametrize('punct_add', ["`"])
+@pytest.mark.parametrize('text', ["Привет"])
+def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add, text):
+    tokens = ru_tokenizer(punct + punct_add + text)
+    assert len(tokens) == 3
+    assert tokens[0].text == punct
+    assert tokens[1].text == punct_add
+    assert tokens[2].text == text
+
+
+@pytest.mark.parametrize('punct', PUNCT_CLOSE)
+@pytest.mark.parametrize('punct_add', ["'"])
+@pytest.mark.parametrize('text', ["Привет"])
+def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add, text):
+    tokens = ru_tokenizer(text + punct + punct_add)
+    assert len(tokens) == 3
+    assert tokens[0].text == text
+    assert tokens[1].text == punct
+    assert tokens[2].text == punct_add
+
+
+@pytest.mark.parametrize('punct', PUNCT_OPEN)
+@pytest.mark.parametrize('text', ["Привет"])
+def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text):
+    tokens = ru_tokenizer(punct + punct + punct + text)
+    assert len(tokens) == 4
+    assert tokens[0].text == punct
+    assert tokens[3].text == text
+
+
+@pytest.mark.parametrize('punct', PUNCT_CLOSE)
+@pytest.mark.parametrize('text', ["Привет"])
+def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text):
+    tokens = ru_tokenizer(text + punct + punct + punct)
+    assert len(tokens) == 4
+    assert tokens[0].text == text
+    assert tokens[1].text == punct
+
+
+@pytest.mark.parametrize('text', ["'Тест"])
+def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text):
+    tokens = ru_tokenizer(text)
+    assert len(tokens) == 2
+    assert tokens[0].text == "'"
+
+
+@pytest.mark.parametrize('text', ["Тест''"])
+def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text):
+    tokens = ru_tokenizer(text)
+    assert len(tokens) == 2
+    tokens_punct = ru_tokenizer("''")
+    assert len(tokens_punct) == 1
+
+
+@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
+@pytest.mark.parametrize('text', ["Тест"])
+def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open,
+                                              punct_close, text):
+    tokens = ru_tokenizer(punct_open + text + punct_close)
+    assert len(tokens) == 3
+    assert tokens[0].text == punct_open
+    assert tokens[1].text == text
+    assert tokens[2].text == punct_close
+
+
+@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
+@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
+@pytest.mark.parametrize('text', ["Тест"])
+def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close,
+                                     punct_open2, punct_close2, text):
+    tokens = ru_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
+    assert len(tokens) == 5
+    assert tokens[0].text == punct_open2
+    assert tokens[1].text == punct_open
+    assert tokens[2].text == text
+    assert tokens[3].text == punct_close
+    assert tokens[4].text == punct_close2
+
+
+@pytest.mark.parametrize('text', [("Тест.")])
+def test_ru_tokenizer_splits_trailing_dot(ru_tokenizer, text):
+    tokens = ru_tokenizer(text)
+    assert tokens[1].text == "."
+
+
+def test_ru_tokenizer_splits_bracket_period(ru_tokenizer):
+    text = "(Раз, два, три, проверка)."
+    tokens = ru_tokenizer(text)
+    assert tokens[len(tokens) - 1].text == "."
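
A quick usage sketch for reviewers, not part of the diff above: it mirrors what the new ru_tokenizer fixture in conftest.py builds and what the punctuation tests exercise. It assumes spaCy is installed from this branch; constructing the Russian defaults may also require the pymorphy2 package, which the Russian lemmatizer depends on.

    # Informal sketch (not part of the patch): build the same tokenizer the
    # ru_tokenizer fixture returns and check that punctuation is split off.
    from spacy import util

    ru_tokenizer = util.get_lang_class('ru').Defaults.create_tokenizer()

    doc = ru_tokenizer("(Раз, два, три, проверка).")
    print([t.text for t in doc])
    # With the Cyrillic ranges merged into ALPHA, the words stay intact while
    # the brackets, commas and final period become separate tokens, e.g.
    # ['(', 'Раз', ',', 'два', ',', 'три', ',', 'проверка', ')', '.']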