Fixed tokenizer: added char classes; added first lemmatizer and tokenizer tests
Vadim Mazaev 2017-11-21 22:23:59 +03:00
parent 52ee1f9bf9
commit 81314f8659
5 changed files with 220 additions and 5 deletions

View File

@@ -15,9 +15,11 @@ _hebrew = r'[\p{L}&&\p{Hebrew}]'
_latin_lower = r'[\p{Ll}&&\p{Latin}]'
_latin_upper = r'[\p{Lu}&&\p{Latin}]'
_latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
+_russian_lower = r'[ёа-я]'
+_russian_upper = r'[ЁА-Я]'
-_upper = [_latin_upper]
-_lower = [_latin_lower]
+_upper = [_latin_upper, _russian_upper]
+_lower = [_latin_lower, _russian_lower]
_uncased = [_bengali, _hebrew]
ALPHA = merge_char_classes(_upper + _lower + _uncased)
@@ -27,8 +29,9 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased)
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
          'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
-          'TB T G M K %')
-_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
+          'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
+          'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')
+_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽'
# These expressions contain various unicode variations, including characters
# used in Chinese (see #1333, #1340, #1351) unless there are cross-language
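
For context, a minimal sketch (not part of the commit) of why the new ranges spell out ё/Ё explicitly: those letters sit outside the а-я and А-Я blocks, so a plain range would miss them. Before this change, ALPHA only merged the Latin, Bengali and Hebrew classes, so Cyrillic letters were not covered at all. The plain re module is enough for these two classes; the surrounding \p{...} classes additionally rely on the third-party regex package.

import re

_russian_lower = r'[ёа-я]'
_russian_upper = r'[ЁА-Я]'

assert re.match(_russian_lower, 'ё')       # U+0451 is outside U+0430-U+044F, hence the explicit 'ё'
assert re.match(_russian_upper, 'Ё')       # same for the uppercase variant
assert re.match(_russian_lower, 'я')       # regular range still covered
assert not re.match(_russian_lower, 'a')   # Latin 'a' stays out of the Cyrillic class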

View File

@@ -15,7 +15,7 @@ from .. import util
# here if it's using spaCy's tokenizer (not a different library)
# TODO: re-implement generic tokenizer tests
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
+              'it', 'nb', 'nl', 'pl', 'pt', 'ru', 'sv', 'xx']
_models = {'en': ['en_core_web_sm'],
           'de': ['de_core_news_md'],
           'fr': ['fr_core_news_sm'],
@@ -40,6 +40,11 @@ def FR(request):
    return load_test_model(request.param)


+@pytest.fixture()
+def RU(request):
+    return util.get_lang_class('ru')


#@pytest.fixture(params=_languages)
#def tokenizer(request):
#lang = util.get_lang_class(request.param)
@@ -137,6 +142,11 @@ def th_tokenizer():
    return util.get_lang_class('th').Defaults.create_tokenizer()


+@pytest.fixture
+def ru_tokenizer():
+    return util.get_lang_class('ru').Defaults.create_tokenizer()


@pytest.fixture
def stringstore():
    return StringStore()
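
A quick usage sketch (not in the commit) of how the new fixture is consumed: a test module under the Russian test package requests ru_tokenizer by name, and pytest builds it through util.get_lang_class('ru').Defaults.create_tokenizer() as defined above. The test name, the sample sentence and the exact expected split are illustrative assumptions.

# hypothetical test relying on the ru_tokenizer fixture added in this commit
def test_ru_tokenizer_simple_sentence(ru_tokenizer):
    tokens = ru_tokenizer('Привет, мир!')
    # expected behaviour: words stay whole, trailing punctuation is split off
    assert [t.text for t in tokens] == ['Привет', ',', 'мир', '!']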

View File

View File

@@ -0,0 +1,71 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest

from ....tokens.doc import Doc


@pytest.fixture
def ru_lemmatizer(RU):
    return RU.Defaults.create_lemmatizer()


# @pytest.mark.models('ru')
# def test_doc_lemmatization(RU):
#     doc = Doc(RU.vocab, words=['мама', 'мыла', 'раму'])
#     doc[0].tag_ = 'NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing'
#     doc[1].tag_ = 'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act'
#     doc[2].tag_ = 'NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing'
#
#     lemmas = [token.lemma_ for token in doc]
#     assert lemmas == ['мама', 'мыть', 'рама']


@pytest.mark.models('ru')
@pytest.mark.parametrize('text,lemmas', [('гвоздики', ['гвоздик', 'гвоздика']),
                                         ('люди', ['человек']),
                                         ('реки', ['река']),
                                         ('кольцо', ['кольцо']),
                                         ('пепперони', ['пепперони'])])
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
    assert sorted(ru_lemmatizer.noun(text)) == lemmas


@pytest.mark.models('ru')
@pytest.mark.parametrize('text,pos,morphology,lemma', [('рой', 'NOUN', None, 'рой'),
                                                       ('рой', 'VERB', None, 'рыть'),
                                                       ('клей', 'NOUN', None, 'клей'),
                                                       ('клей', 'VERB', None, 'клеить'),
                                                       ('три', 'NUM', None, 'три'),
                                                       ('кос', 'NOUN', {'Number': 'Sing'}, 'кос'),
                                                       ('кос', 'NOUN', {'Number': 'Plur'}, 'коса'),
                                                       ('кос', 'ADJ', None, 'косой'),
                                                       ('потом', 'NOUN', None, 'пот'),
                                                       ('потом', 'ADV', None, 'потом')])
def test_ru_lemmatizer_works_with_different_pos_homonyms(ru_lemmatizer, text, pos, morphology, lemma):
    assert ru_lemmatizer(text, pos, morphology) == [lemma]


@pytest.mark.models('ru')
@pytest.mark.parametrize('text,morphology,lemma', [('гвоздики', {'Gender': 'Fem'}, 'гвоздика'),
                                                   ('гвоздики', {'Gender': 'Masc'}, 'гвоздик'),
                                                   ('вина', {'Gender': 'Fem'}, 'вина'),
                                                   ('вина', {'Gender': 'Neut'}, 'вино')])
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
    assert ru_lemmatizer.noun(text, morphology) == [lemma]


# @pytest.mark.models('ru')
# def test_ru_lemmatizer_punct(ru_lemmatizer):
#     assert ru_lemmatizer.punct('“') == ['"']
#     assert ru_lemmatizer.punct('“') == ['"']
#
#
# @pytest.mark.models('ru')
# def test_ru_lemmatizer_lemma_assignment(RU):
#     text = "А роза упала на лапу Азора."
#     doc = RU.make_doc(text)
#     RU.tagger(doc)
#     assert all(t.lemma_ != '' for t in doc)
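
As background for the tests above, a standalone sketch (not part of the commit) of how the lemmatizer under test could be exercised outside pytest. It assumes an installed spaCy matching this branch and the pymorphy2 package, which the Russian lemmatizer is understood to wrap; the expected outputs simply mirror the parametrized cases above.

# hypothetical standalone usage of the Russian lemmatizer
from spacy.util import get_lang_class

ru_cls = get_lang_class('ru')
lemmatizer = ru_cls.Defaults.create_lemmatizer()

print(lemmatizer('люди', 'NOUN', None))             # expected: ['человек']
print(lemmatizer('рой', 'VERB', None))              # expected: ['рыть']
print(lemmatizer.noun('кос', {'Number': 'Plur'}))   # expected: ['коса']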

View File

@@ -0,0 +1,131 @@
# coding: utf-8
"""Test that open, closed and paired punctuation is split off correctly."""
from __future__ import unicode_literals

import pytest

from ....util import compile_prefix_regex
from ....lang.punctuation import TOKENIZER_PREFIXES


PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*']
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


@pytest.mark.parametrize('text', ["(", "((", "<"])
def test_ru_tokenizer_handles_only_punct(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == len(text)


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(punct + text)
    assert len(tokens) == 2
    assert tokens[0].text == punct
    assert tokens[1].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(text + punct)
    assert len(tokens) == 2
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add, text):
    tokens = ru_tokenizer(punct + punct_add + text)
    assert len(tokens) == 3
    assert tokens[0].text == punct
    assert tokens[1].text == punct_add
    assert tokens[2].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add, text):
    tokens = ru_tokenizer(text + punct + punct_add)
    assert len(tokens) == 3
    assert tokens[0].text == text
    assert tokens[1].text == punct
    assert tokens[2].text == punct_add


@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(punct + punct + punct + text)
    assert len(tokens) == 4
    assert tokens[0].text == punct
    assert tokens[3].text == text


@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет"])
def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text):
    tokens = ru_tokenizer(text + punct + punct + punct)
    assert len(tokens) == 4
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize('text', ["'Тест"])
def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == "'"


@pytest.mark.parametrize('text', ["Тест''"])
def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert len(tokens) == 2
    tokens_punct = ru_tokenizer("''")
    assert len(tokens_punct) == 1


@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Тест"])
def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open,
                                              punct_close, text):
    tokens = ru_tokenizer(punct_open + text + punct_close)
    assert len(tokens) == 3
    assert tokens[0].text == punct_open
    assert tokens[1].text == text
    assert tokens[2].text == punct_close


@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
@pytest.mark.parametrize('text', ["Тест"])
def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close,
                                     punct_open2, punct_close2, text):
    tokens = ru_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
    assert len(tokens) == 5
    assert tokens[0].text == punct_open2
    assert tokens[1].text == punct_open
    assert tokens[2].text == text
    assert tokens[3].text == punct_close
    assert tokens[4].text == punct_close2


@pytest.mark.parametrize('text', [("Тест.")])
def test_ru_tokenizer_splits_trailing_dot(ru_tokenizer, text):
    tokens = ru_tokenizer(text)
    assert tokens[1].text == "."


def test_ru_tokenizer_splits_bracket_period(ru_tokenizer):
    text = "(Раз, два, три, проверка)."
    tokens = ru_tokenizer(text)
    assert tokens[len(tokens) - 1].text == "."
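
The file imports compile_prefix_regex and TOKENIZER_PREFIXES; a minimal sketch (not in the commit, assuming an installed spaCy from this branch and an illustrative input string) of what those helpers provide to the tokenizer whose behaviour is tested above:

# compile_prefix_regex joins the prefix patterns into one compiled regex;
# the tokenizer uses its search() to peel leading punctuation off each chunk.
from spacy.util import compile_prefix_regex
from spacy.lang.punctuation import TOKENIZER_PREFIXES

prefix_re = compile_prefix_regex(TOKENIZER_PREFIXES)
match = prefix_re.search('"Привет')
assert match is not None    # the leading quote is recognised as a prefix
print(match.group())        # expected: '"'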