Fixed tokenizer: added char classes; added first lemmatizer and tokenizer tests
Vadim Mazaev 2017-11-21 22:23:59 +03:00
parent 52ee1f9bf9
commit 81314f8659
5 changed files with 220 additions and 5 deletions

View File

@@ -15,9 +15,11 @@ _hebrew = r'[\p{L}&&\p{Hebrew}]'
_latin_lower = r'[\p{Ll}&&\p{Latin}]'
_latin_upper = r'[\p{Lu}&&\p{Latin}]'
_latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
+_russian_lower = r'[ёа-я]'
+_russian_upper = r'[ЁА-Я]'
-_upper = [_latin_upper]
-_lower = [_latin_lower]
+_upper = [_latin_upper, _russian_upper]
+_lower = [_latin_lower, _russian_lower]
_uncased = [_bengali, _hebrew]
ALPHA = merge_char_classes(_upper + _lower + _uncased)
@@ -27,8 +29,9 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
          'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
-          'TB T G M K %')
-_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
+          'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
+          'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')
+_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽'
# These expressions contain various unicode variations, including characters
# used in Chinese (see #1333, #1340, #1351) unless there are cross-language
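
For context, a minimal sketch (not part of the commit) of why the new ranges spell out ё/Ё explicitly: those letters sit outside the а-я and А-Я blocks, so a plain range would miss them. Before this change, ALPHA only merged the Latin, Bengali and Hebrew classes, so Cyrillic letters were not covered at all. The plain re module is enough for these two classes; the surrounding \p{...} classes additionally rely on the third-party regex package.

import re

_russian_lower = r'[ёа-я]'
_russian_upper = r'[ЁА-Я]'

assert re.match(_russian_lower, 'ё')       # U+0451 is outside U+0430-U+044F, hence the explicit 'ё'
assert re.match(_russian_upper, 'Ё')       # same for the uppercase variant
assert re.match(_russian_lower, 'я')       # regular range still covered
assert not re.match(_russian_lower, 'a')   # Latin 'a' stays out of the Cyrillic class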

View File

@@ -15,7 +15,7 @@ from .. import util
# here if it's using spaCy's tokenizer (not a different library)
# TODO: re-implement generic tokenizer tests
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
+              'it', 'nb', 'nl', 'pl', 'pt', 'ru', 'sv', 'xx']
_models = {'en': ['en_core_web_sm'],
           'de': ['de_core_news_md'],
           'fr': ['fr_core_news_sm'],
@@ -40,6 +40,11 @@ def FR(request):
    return load_test_model(request.param)


+@pytest.fixture()
+def RU(request):
+    return util.get_lang_class('ru')


#@pytest.fixture(params=_languages)
#def tokenizer(request):
#lang = util.get_lang_class(request.param)
@@ -137,6 +142,11 @@ def th_tokenizer():
    return util.get_lang_class('th').Defaults.create_tokenizer()


+@pytest.fixture
+def ru_tokenizer():
+    return util.get_lang_class('ru').Defaults.create_tokenizer()


@pytest.fixture
def stringstore():
    return StringStore()
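
A quick usage sketch (not in the commit) of how the new fixture is consumed: a test module under the Russian test package requests ru_tokenizer by name, and pytest builds it through util.get_lang_class('ru').Defaults.create_tokenizer() as defined above. The test name, the sample sentence and the exact expected split are illustrative assumptions.

# hypothetical test relying on the ru_tokenizer fixture added in this commit
def test_ru_tokenizer_simple_sentence(ru_tokenizer):
    tokens = ru_tokenizer('Привет, мир!')
    # expected behaviour: words stay whole, trailing punctuation is split off
    assert [t.text for t in tokens] == ['Привет', ',', 'мир', '!']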

View File

View File

@@ -0,0 +1,71 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest

from ....tokens.doc import Doc


@pytest.fixture
def ru_lemmatizer(RU):
    return RU.Defaults.create_lemmatizer()


# @pytest.mark.models('ru')
# def test_doc_lemmatization(RU):
#     doc = Doc(RU.vocab, words=['мама', 'мыла', 'раму'])
#     doc[0].tag_ = 'NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing'
#     doc[1].tag_ = 'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act'
#     doc[2].tag_ = 'NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing'
#
#     lemmas = [token.lemma_ for token in doc]
#     assert lemmas == ['мама', 'мыть', 'рама']


@pytest.mark.models('ru')
@pytest.mark.parametrize('text,lemmas', [('гвоздики', ['гвоздик', 'гвоздика']),
                                         ('люди', ['человек']),
                                         ('реки', ['река']),
                                         ('кольцо', ['кольцо']),
                                         ('пепперони', ['пепперони'])])
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
    assert sorted(ru_lemmatizer.noun(text)) == lemmas


@pytest.mark.models('ru')
@pytest.mark.parametrize('text,pos,morphology,lemma', [('рой', 'NOUN', None, 'рой'),
                                                       ('рой', 'VERB', None, 'рыть'),
                                                       ('клей', 'NOUN', None, 'клей'),
                                                       ('клей', 'VERB', None, 'клеить'),
                                                       ('три', 'NUM', None, 'три'),
                                                       ('кос', 'NOUN', {'Number': 'Sing'}, 'кос'),
                                                       ('кос', 'NOUN', {'Number': 'Plur'}, 'коса'),
                                                       ('кос', 'ADJ', None, 'косой'),
                                                       ('потом', 'NOUN', None, 'пот'),
                                                       ('потом', 'ADV', None, 'потом')])
def test_ru_lemmatizer_works_with_different_pos_homonyms(ru_lemmatizer, text, pos, morphology, lemma):
    assert ru_lemmatizer(text, pos, morphology) == [lemma]


@pytest.mark.models('ru')
@pytest.mark.parametrize('text,morphology,lemma', [('гвоздики', {'Gender': 'Fem'}, 'гвоздика'),
                                                   ('гвоздики', {'Gender': 'Masc'}, 'гвоздик'),
                                                   ('вина', {'Gender': 'Fem'}, 'вина'),
                                                   ('вина', {'Gender': 'Neut'}, 'вино')])
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
    assert ru_lemmatizer.noun(text, morphology) == [lemma]


# @pytest.mark.models('ru')
# def test_ru_lemmatizer_punct(ru_lemmatizer):
#     assert ru_lemmatizer.punct('“') == ['"']
#     assert ru_lemmatizer.punct('“') == ['"']
#
#
# @pytest.mark.models('ru')
# def test_ru_lemmatizer_lemma_assignment(RU):
#     text = "А роза упала на лапу Азора."
#     doc = RU.make_doc(text)
#     RU.tagger(doc)
#     assert all(t.lemma_ != '' for t in doc)
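
As background for the tests above, a standalone sketch (not part of the commit) of how the lemmatizer under test could be exercised outside pytest. It assumes an installed spaCy matching this branch and the pymorphy2 package, which the Russian lemmatizer is understood to wrap; the expected outputs simply mirror the parametrized cases above.

# hypothetical standalone usage of the Russian lemmatizer
from spacy.util import get_lang_class

ru_cls = get_lang_class('ru')
lemmatizer = ru_cls.Defaults.create_lemmatizer()

print(lemmatizer('люди', 'NOUN', None))             # expected: ['человек']
print(lemmatizer('рой', 'VERB', None))              # expected: ['рыть']
print(lemmatizer.noun('кос', {'Number': 'Plur'}))   # expected: ['коса']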

View File

@@ -0,0 +1,131 @@
# coding: utf-8
"""Test that open, closed and paired punctuation is split off correctly."""
from __future__ import unicode_literals

import pytest

from ....util import compile_prefix_regex
from ....lang.punctuation import TOKENIZER_PREFIXES


PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*']
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


@pytest.mark.parametrize('text', ["(", "((", "<"])
def test_ru_tokenizer_handles_only_punct(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == len(text)


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(punct + text)
    assert len(tokens) == 2
    assert tokens[0].text == punct
    assert tokens[1].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(text + punct)
    assert len(tokens) == 2
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add, text):
    tokens = ru_tokenizer(punct + punct_add + text)
    assert len(tokens) == 3
    assert tokens[0].text == punct
    assert tokens[1].text == punct_add
    assert tokens[2].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add, text):
    tokens = ru_tokenizer(text + punct + punct_add)
    assert len(tokens) == 3
    assert tokens[0].text == text
    assert tokens[1].text == punct
    assert tokens[2].text == punct_add


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(punct + punct + punct + text)
    assert len(tokens) == 4
    assert tokens[0].text == punct
    assert tokens[3].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(text + punct + punct + punct)
    assert len(tokens) == 4
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize('text', ["'Тест"])
def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == "'"


@pytest.mark.parametrize('text', ["Тест''"])
def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == 2
    tokens_punct = ru_tokenizer("''")
    assert len(tokens_punct) == 1


@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Тест"])
def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open,
                                              punct_close, text):
    tokens = ru_tokenizer(punct_open + text + punct_close)
    assert len(tokens) == 3
    assert tokens[0].text == punct_open
    assert tokens[1].text == text
    assert tokens[2].text == punct_close


@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
@pytest.mark.parametrize('text', ["Тест"])
def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close,
                                     punct_open2, punct_close2, text):
    tokens = ru_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
    assert len(tokens) == 5
    assert tokens[0].text == punct_open2
    assert tokens[1].text == punct_open
    assert tokens[2].text == text
    assert tokens[3].text == punct_close
    assert tokens[4].text == punct_close2


@pytest.mark.parametrize('text', [("Тест.")])
def test_ru_tokenizer_splits_trailing_dot(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert tokens[1].text == "."


def test_ru_tokenizer_splits_bracket_period(ru_tokenizer):
    text = "(Раз, два, три, проверка)."
    tokens = ru_tokenizer(text)
    assert tokens[len(tokens) - 1].text == "."
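
The file imports compile_prefix_regex and TOKENIZER_PREFIXES; a minimal sketch (not in the commit, assuming an installed spaCy from this branch and an illustrative input string) of what those helpers provide to the tokenizer whose behaviour is tested above:

# compile_prefix_regex joins the prefix patterns into one compiled regex;
# the tokenizer uses its search() to peel leading punctuation off each chunk.
from spacy.util import compile_prefix_regex
from spacy.lang.punctuation import TOKENIZER_PREFIXES

prefix_re = compile_prefix_regex(TOKENIZER_PREFIXES)
match = prefix_re.search('"Привет')
assert match is not None    # the leading quote is recognised as a prefix
print(match.group())        # expected: '"'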