Fixed tokenizer: added char classes; added first lemmatizer and tokenizer tests
This commit is contained in:
parent 52ee1f9bf9
commit 81314f8659
@@ -15,9 +15,11 @@ _hebrew = r'[\p{L}&&\p{Hebrew}]'
 _latin_lower = r'[\p{Ll}&&\p{Latin}]'
 _latin_upper = r'[\p{Lu}&&\p{Latin}]'
 _latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
+_russian_lower = r'[ёа-я]'
+_russian_upper = r'[ЁА-Я]'
 
-_upper = [_latin_upper]
-_lower = [_latin_lower]
+_upper = [_latin_upper, _russian_upper]
+_lower = [_latin_lower, _russian_lower]
 _uncased = [_bengali, _hebrew]
 
 ALPHA = merge_char_classes(_upper + _lower + _uncased)

@@ -27,8 +29,9 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)
 
 _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
-          'TB T G M K %')
-_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
+          'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
+          'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')
+_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽'
 
 # These expressions contain various unicode variations, including characters
 # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
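As an aside, the character classes above are plain regex fragments (using the set syntax of the third-party regex module, e.g. && for intersection) that get merged into the combined ALPHA classes. A minimal illustrative sketch of that merging step, with merge_char_classes written here as a hypothetical stand-in that simply joins the fragments into one alternation (not spaCy's actual implementation):

    # Hypothetical stand-in for the merge step: joins regex character-class
    # fragments into a single alternation string for use in larger patterns.
    def merge_char_classes(classes):
        return '|'.join(c for c in classes if c)

    _latin_lower = r'[\p{Ll}&&\p{Latin}]'
    _latin_upper = r'[\p{Lu}&&\p{Latin}]'
    _russian_lower = r'[ёа-я]'
    _russian_upper = r'[ЁА-Я]'

    _upper = [_latin_upper, _russian_upper]
    _lower = [_latin_lower, _russian_lower]

    ALPHA_UPPER = merge_char_classes(_upper)
    ALPHA_LOWER = merge_char_classes(_lower)
    print(ALPHA_UPPER)  # in this sketch: [\p{Lu}&&\p{Latin}]|[ЁА-Я]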
@@ -15,7 +15,7 @@ from .. import util
 # here if it's using spaCy's tokenizer (not a different library)
 # TODO: re-implement generic tokenizer tests
 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
+              'it', 'nb', 'nl', 'pl', 'pt', 'ru', 'sv', 'xx']
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_core_news_sm'],

@@ -40,6 +40,11 @@ def FR(request):
     return load_test_model(request.param)
 
 
+@pytest.fixture()
+def RU(request):
+    return util.get_lang_class('ru')
+
+
 #@pytest.fixture(params=_languages)
 #def tokenizer(request):
 #    lang = util.get_lang_class(request.param)

@@ -137,6 +142,11 @@ def th_tokenizer():
     return util.get_lang_class('th').Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def ru_tokenizer():
+    return util.get_lang_class('ru').Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()
new file: spacy/tests/lang/ru/__init__.py (0 lines)

new file: spacy/tests/lang/ru/test_lemmatizer.py (71 lines)
@@ -0,0 +1,71 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest
from ....tokens.doc import Doc


@pytest.fixture
def ru_lemmatizer(RU):
    return RU.Defaults.create_lemmatizer()


# @pytest.mark.models('ru')
# def test_doc_lemmatization(RU):
#     doc = Doc(RU.vocab, words=['мама', 'мыла', 'раму'])
#     doc[0].tag_ = 'NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing'
#     doc[1].tag_ = 'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act'
#     doc[2].tag_ = 'NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing'
#
#     lemmas = [token.lemma_ for token in doc]
#     assert lemmas == ['мама', 'мыть', 'рама']


@pytest.mark.models('ru')
@pytest.mark.parametrize('text,lemmas', [('гвоздики', ['гвоздик', 'гвоздика']),
                                         ('люди', ['человек']),
                                         ('реки', ['река']),
                                         ('кольцо', ['кольцо']),
                                         ('пепперони', ['пепперони'])])
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
    assert sorted(ru_lemmatizer.noun(text)) == lemmas


@pytest.mark.models('ru')
@pytest.mark.parametrize('text,pos,morphology,lemma', [('рой', 'NOUN', None, 'рой'),
                                                       ('рой', 'VERB', None, 'рыть'),
                                                       ('клей', 'NOUN', None, 'клей'),
                                                       ('клей', 'VERB', None, 'клеить'),
                                                       ('три', 'NUM', None, 'три'),
                                                       ('кос', 'NOUN', {'Number': 'Sing'}, 'кос'),
                                                       ('кос', 'NOUN', {'Number': 'Plur'}, 'коса'),
                                                       ('кос', 'ADJ', None, 'косой'),
                                                       ('потом', 'NOUN', None, 'пот'),
                                                       ('потом', 'ADV', None, 'потом')])
def test_ru_lemmatizer_works_with_different_pos_homonyms(ru_lemmatizer, text, pos, morphology, lemma):
    assert ru_lemmatizer(text, pos, morphology) == [lemma]


@pytest.mark.models('ru')
@pytest.mark.parametrize('text,morphology,lemma', [('гвоздики', {'Gender': 'Fem'}, 'гвоздика'),
                                                   ('гвоздики', {'Gender': 'Masc'}, 'гвоздик'),
                                                   ('вина', {'Gender': 'Fem'}, 'вина'),
                                                   ('вина', {'Gender': 'Neut'}, 'вино')])
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
    assert ru_lemmatizer.noun(text, morphology) == [lemma]


# @pytest.mark.models('ru')
# def test_ru_lemmatizer_punct(ru_lemmatizer):
#     assert ru_lemmatizer.punct('“') == ['"']
#     assert ru_lemmatizer.punct('“') == ['"']
#
#
# @pytest.mark.models('ru')
# def test_ru_lemmatizer_lemma_assignment(RU):
#     text = "А роза упала на лапу Азора."
#     doc = RU.make_doc(text)
#     RU.tagger(doc)
#     assert all(t.lemma_ != '' for t in doc)
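For reference, a minimal sketch of exercising the same lemmatizer outside pytest, mirroring the ru_lemmatizer fixture above (assumes the Russian language data and its lemmatizer backend, e.g. pymorphy2, are installed):

    from spacy import util

    RU = util.get_lang_class('ru')                    # what the RU fixture returns
    ru_lemmatizer = RU.Defaults.create_lemmatizer()   # what the ru_lemmatizer fixture returns

    # noun() returns the candidate noun lemmas for a surface form
    print(sorted(ru_lemmatizer.noun('гвоздики')))     # per the test above: ['гвоздик', 'гвоздика']

    # passing a POS tag (and optional morphology dict) disambiguates homonyms
    print(ru_lemmatizer('рой', 'VERB', None))         # per the test above: ['рыть']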
new file: spacy/tests/lang/ru/test_punct.py (131 lines)
@@ -0,0 +1,131 @@
# coding: utf-8
"""Test that open, closed and paired punctuation is split off correctly."""


from __future__ import unicode_literals

import pytest

from ....util import compile_prefix_regex
from ....lang.punctuation import TOKENIZER_PREFIXES


PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*']
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


@pytest.mark.parametrize('text', ["(", "((", "<"])
def test_ru_tokenizer_handles_only_punct(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == len(text)


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(punct + text)
    assert len(tokens) == 2
    assert tokens[0].text == punct
    assert tokens[1].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(text + punct)
    assert len(tokens) == 2
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add, text):
    tokens = ru_tokenizer(punct + punct_add + text)
    assert len(tokens) == 3
    assert tokens[0].text == punct
    assert tokens[1].text == punct_add
    assert tokens[2].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add, text):
    tokens = ru_tokenizer(text + punct + punct_add)
    assert len(tokens) == 3
    assert tokens[0].text == text
    assert tokens[1].text == punct
    assert tokens[2].text == punct_add


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(punct + punct + punct + text)
    assert len(tokens) == 4
    assert tokens[0].text == punct
    assert tokens[3].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(text + punct + punct + punct)
    assert len(tokens) == 4
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize('text', ["'Тест"])
def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == "'"


@pytest.mark.parametrize('text', ["Тест''"])
def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == 2
    tokens_punct = ru_tokenizer("''")
    assert len(tokens_punct) == 1


@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Тест"])
def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open,
                                              punct_close, text):
    tokens = ru_tokenizer(punct_open + text + punct_close)
    assert len(tokens) == 3
    assert tokens[0].text == punct_open
    assert tokens[1].text == text
    assert tokens[2].text == punct_close


@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
@pytest.mark.parametrize('text', ["Тест"])
def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close,
                                     punct_open2, punct_close2, text):
    tokens = ru_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
    assert len(tokens) == 5
    assert tokens[0].text == punct_open2
    assert tokens[1].text == punct_open
    assert tokens[2].text == text
    assert tokens[3].text == punct_close
    assert tokens[4].text == punct_close2


@pytest.mark.parametrize('text', [("Тест.")])
def test_ru_tokenizer_splits_trailing_dot(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert tokens[1].text == "."


def test_ru_tokenizer_splits_bracket_period(ru_tokenizer):
    text = "(Раз, два, три, проверка)."
    tokens = ru_tokenizer(text)
    assert tokens[len(tokens) - 1].text == "."
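And a minimal sketch of the splitting behaviour these tests pin down, run outside pytest (assumes the Russian language class can be loaded via spacy.util.get_lang_class, as in the conftest changes above):

    from spacy import util

    ru_tokenizer = util.get_lang_class('ru').Defaults.create_tokenizer()

    # Mirrors test_ru_tokenizer_splits_bracket_period: the brackets, the words,
    # the commas and the trailing period should all come out as separate tokens.
    tokens = ru_tokenizer("(Раз, два, три, проверка).")
    print([t.text for t in tokens])
    assert tokens[len(tokens) - 1].text == "."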