Mirror of https://github.com/explosion/spaCy.git
Fixed tokenizer: added char classes; added first lemmatizer and tokenizer tests

parent 52ee1f9bf9
commit 81314f8659
spacy/lang/char_classes.py

@@ -15,9 +15,11 @@ _hebrew = r'[\p{L}&&\p{Hebrew}]'
 _latin_lower = r'[\p{Ll}&&\p{Latin}]'
 _latin_upper = r'[\p{Lu}&&\p{Latin}]'
 _latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
+_russian_lower = r'[ёа-я]'
+_russian_upper = r'[ЁА-Я]'
 
-_upper = [_latin_upper]
-_lower = [_latin_lower]
+_upper = [_latin_upper, _russian_upper]
+_lower = [_latin_lower, _russian_lower]
 _uncased = [_bengali, _hebrew]
 
 ALPHA = merge_char_classes(_upper + _lower + _uncased)

@@ -27,8 +29,9 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)
 
 _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
-          'TB T G M K %')
-_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
+          'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
+          'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')
+_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽'
 
 # These expressions contain various unicode variations, including characters
 # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
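
The effect of the new character classes can be checked in isolation. The following sketch is not spaCy's own merge_char_classes() code; it assumes only the third-party regex package (whose \p{...} syntax the classes above are written in) and shows that an alphabetic pattern accepts Cyrillic text once the Russian ranges are merged in.

import regex  # third-party `regex` module; needed for \p{...} character classes

_russian_lower = r'[ёа-я]'
_russian_upper = r'[ЁА-Я]'

# A Latin-only "alpha" class rejects Cyrillic; merging the Russian ranges into
# the same set (here by splicing in their contents) makes it accept both scripts.
alpha_old = regex.compile(r'[\p{Latin}]+')
alpha_new = regex.compile(r'[\p{Latin}' + _russian_lower[1:-1] + _russian_upper[1:-1] + r']+')

assert alpha_old.fullmatch('Привет') is None
assert alpha_new.fullmatch('Привет') is not None
assert alpha_new.fullmatch('Hello') is not None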

spacy/tests/conftest.py

@@ -15,7 +15,7 @@ from .. import util
 # here if it's using spaCy's tokenizer (not a different library)
 # TODO: re-implement generic tokenizer tests
 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
+              'it', 'nb', 'nl', 'pl', 'pt', 'ru', 'sv', 'xx']
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_core_news_sm'],

@@ -40,6 +40,11 @@ def FR(request):
     return load_test_model(request.param)
 
 
+@pytest.fixture()
+def RU(request):
+    return util.get_lang_class('ru')
+
+
 #@pytest.fixture(params=_languages)
 #def tokenizer(request):
 #lang = util.get_lang_class(request.param)

@@ -137,6 +142,11 @@ def th_tokenizer():
     return util.get_lang_class('th').Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def ru_tokenizer():
+    return util.get_lang_class('ru').Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()
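
Not part of the commit: a minimal sketch of how the new fixtures are meant to be consumed by the Russian test modules added below. The exact token split is an assumption, namely that the shared punctuation rules split commas and exclamation marks off Cyrillic words just as they do for Latin ones.

def test_ru_tokenizer_handles_cyrillic_text(ru_tokenizer):
    # `ru_tokenizer` is the fixture defined in conftest.py above
    tokens = ru_tokenizer("Привет, мир!")
    assert [t.text for t in tokens] == ["Привет", ",", "мир", "!"]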

spacy/tests/lang/ru/__init__.py (new file, empty)

spacy/tests/lang/ru/test_lemmatizer.py (new file, 71 lines)

# coding: utf-8
from __future__ import unicode_literals

import pytest
from ....tokens.doc import Doc


@pytest.fixture
def ru_lemmatizer(RU):
    return RU.Defaults.create_lemmatizer()


# @pytest.mark.models('ru')
# def test_doc_lemmatization(RU):
#     doc = Doc(RU.vocab, words=['мама', 'мыла', 'раму'])
#     doc[0].tag_ = 'NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing'
#     doc[1].tag_ = 'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act'
#     doc[2].tag_ = 'NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing'
#
#     lemmas = [token.lemma_ for token in doc]
#     assert lemmas == ['мама', 'мыть', 'рама']


@pytest.mark.models('ru')
@pytest.mark.parametrize('text,lemmas', [('гвоздики', ['гвоздик', 'гвоздика']),
                                         ('люди', ['человек']),
                                         ('реки', ['река']),
                                         ('кольцо', ['кольцо']),
                                         ('пепперони', ['пепперони'])])
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
    assert sorted(ru_lemmatizer.noun(text)) == lemmas


@pytest.mark.models('ru')
@pytest.mark.parametrize('text,pos,morphology,lemma', [('рой', 'NOUN', None, 'рой'),
                                                       ('рой', 'VERB', None, 'рыть'),
                                                       ('клей', 'NOUN', None, 'клей'),
                                                       ('клей', 'VERB', None, 'клеить'),
                                                       ('три', 'NUM', None, 'три'),
                                                       ('кос', 'NOUN', {'Number': 'Sing'}, 'кос'),
                                                       ('кос', 'NOUN', {'Number': 'Plur'}, 'коса'),
                                                       ('кос', 'ADJ', None, 'косой'),
                                                       ('потом', 'NOUN', None, 'пот'),
                                                       ('потом', 'ADV', None, 'потом')
                                                       ])
def test_ru_lemmatizer_works_with_different_pos_homonyms(ru_lemmatizer, text, pos, morphology, lemma):
    assert ru_lemmatizer(text, pos, morphology) == [lemma]


@pytest.mark.models('ru')
@pytest.mark.parametrize('text,morphology,lemma', [('гвоздики', {'Gender': 'Fem'}, 'гвоздика'),
                                                   ('гвоздики', {'Gender': 'Masc'}, 'гвоздик'),
                                                   ('вина', {'Gender': 'Fem'}, 'вина'),
                                                   ('вина', {'Gender': 'Neut'}, 'вино')
                                                   ])
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
    assert ru_lemmatizer.noun(text, morphology) == [lemma]


# @pytest.mark.models('ru')
# def test_ru_lemmatizer_punct(ru_lemmatizer):
#     assert ru_lemmatizer.punct('“') == ['"']
#     assert ru_lemmatizer.punct('“') == ['"']
#
#
# @pytest.mark.models('ru')
# def test_ru_lemmatizer_lemma_assignment(RU):
#     text = "А роза упала на лапу Азора."
#     doc = RU.make_doc(text)
#     RU.tagger(doc)
#     assert all(t.lemma_ != '' for t in doc)
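
Not part of the diff: a short sketch of the lemmatizer interface the tests above exercise, written as plain calls. The calls and expected lemmas are taken from the parametrized tables; running them outside pytest is an assumption and, like the models('ru') marker, requires the Russian language data to be installed.

from spacy import util

# The RU fixture yields the Russian Language class; its defaults build the
# lemmatizer, which is called either per part of speech or with POS + morphology.
ru_cls = util.get_lang_class('ru')
lemmatizer = ru_cls.Defaults.create_lemmatizer()

print(lemmatizer.noun('люди'))                        # ['человек']
print(lemmatizer('рой', 'VERB', None))                # ['рыть']
print(lemmatizer('кос', 'NOUN', {'Number': 'Plur'}))  # ['коса']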

spacy/tests/lang/ru/test_punct.py (new file, 131 lines)

# coding: utf-8
"""Test that open, closed and paired punctuation is split off correctly."""


from __future__ import unicode_literals

import pytest

from ....util import compile_prefix_regex
from ....lang.punctuation import TOKENIZER_PREFIXES


PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*']
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


@pytest.mark.parametrize('text', ["(", "((", "<"])
def test_ru_tokenizer_handles_only_punct(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == len(text)


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(punct + text)
    assert len(tokens) == 2
    assert tokens[0].text == punct
    assert tokens[1].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(text + punct)
    assert len(tokens) == 2
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add, text):
    tokens = ru_tokenizer(punct + punct_add + text)
    assert len(tokens) == 3
    assert tokens[0].text == punct
    assert tokens[1].text == punct_add
    assert tokens[2].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add, text):
    tokens = ru_tokenizer(text + punct + punct_add)
    assert len(tokens) == 3
    assert tokens[0].text == text
    assert tokens[1].text == punct
    assert tokens[2].text == punct_add


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(punct + punct + punct + text)
    assert len(tokens) == 4
    assert tokens[0].text == punct
    assert tokens[3].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(text + punct + punct + punct)
    assert len(tokens) == 4
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize('text', ["'Тест"])
def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == "'"


@pytest.mark.parametrize('text', ["Тест''"])
def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == 2
    tokens_punct = ru_tokenizer("''")
    assert len(tokens_punct) == 1


@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Тест"])
def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open,
                                              punct_close, text):
    tokens = ru_tokenizer(punct_open + text + punct_close)
    assert len(tokens) == 3
    assert tokens[0].text == punct_open
    assert tokens[1].text == text
    assert tokens[2].text == punct_close


@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
@pytest.mark.parametrize('text', ["Тест"])
def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close,
                                     punct_open2, punct_close2, text):
    tokens = ru_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
    assert len(tokens) == 5
    assert tokens[0].text == punct_open2
    assert tokens[1].text == punct_open
    assert tokens[2].text == text
    assert tokens[3].text == punct_close
    assert tokens[4].text == punct_close2


@pytest.mark.parametrize('text', [("Тест.")])
def test_ru_tokenizer_splits_trailing_dot(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert tokens[1].text == "."


def test_ru_tokenizer_splits_bracket_period(ru_tokenizer):
    text = "(Раз, два, три, проверка)."
    tokens = ru_tokenizer(text)
    assert tokens[len(tokens) - 1].text == "."
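
Not part of the commit: a possible follow-up case in the same style. It assumes that the «guillemets» common in Russian typography are covered by the shared quote rules, just as the ASCII brackets and quotes above are; if they are not, the expected token count would differ.

@pytest.mark.parametrize('text', ["«Тест»"])
def test_ru_tokenizer_splits_guillemets(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == 3
    assert tokens[0].text == "«"
    assert tokens[2].text == "»"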