Handle Cyrillic combining diacritics (#10837)

* Handle Russian, Ukrainian and Bulgarian

* Corrections

* Correction

* Correction to comment

* Changes based on review

* Correction

* Reverted irrelevant change in punctuation.py

* Remove unnecessary group

* Reverted accidental change
Richard Hudson 2022-06-28 15:35:32 +02:00 committed by GitHub
parent 8ffff18ac4
commit a9559e7435
8 changed files with 86 additions and 2 deletions

spacy/lang/bg/__init__.py

@@ -2,7 +2,8 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
 from ...language import Language, BaseDefaults
 from ...attrs import LANG
 from ...util import update_exc
@@ -16,6 +17,8 @@ class BulgarianDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
+    infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES

 class Bulgarian(Language):
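
The same two-attribute opt-in works for any Language subclass whose orthography marks stress with combining accents. A minimal sketch; the Custom* class names and the "xx" code are illustrative, not part of this commit, while the imports are the real spaCy paths this change touches:

# Minimal sketch: any Language subclass can opt into the diacritic-aware
# rules exactly as the Bulgarian, Russian and Ukrainian defaults do above.
# CustomDefaults/Custom and lang = "xx" are hypothetical placeholders.
from spacy.language import Language, BaseDefaults
from spacy.lang.punctuation import (
    COMBINING_DIACRITICS_TOKENIZER_INFIXES,
    COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
)


class CustomDefaults(BaseDefaults):
    suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
    infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES


class Custom(Language):
    lang = "xx"
    Defaults = CustomDefaults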

spacy/lang/char_classes.py

@@ -258,6 +258,10 @@ ALPHA = group_chars(
 ALPHA_LOWER = group_chars(_lower + _uncased)
 ALPHA_UPPER = group_chars(_upper + _uncased)

+_combining_diacritics = r"\u0300-\u036f"
+
+COMBINING_DIACRITICS = _combining_diacritics
+
 _units = (
     "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
     "kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb "

spacy/lang/punctuation.py

@@ -1,5 +1,5 @@
 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
+from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS
 from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
@@ -44,3 +44,23 @@ TOKENIZER_INFIXES = (
         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
+
+
+# Some languages, e.g. those written with the Cyrillic alphabet, permit the use
+# of diacritics to mark stressed syllables in words where stress is distinctive.
+# Such languages should use the COMBINING_DIACRITICS... suffix and infix regex
+# lists in place of the standard ones.
+COMBINING_DIACRITICS_TOKENIZER_SUFFIXES = list(TOKENIZER_SUFFIXES) + [
+    r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS),
+]
+
+COMBINING_DIACRITICS_TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [
+    r"(?<=[{al}][{d}])\.(?=[{au}{q}])".format(
+        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES, d=COMBINING_DIACRITICS
+    ),
+    r"(?<=[{a}][{d}]),(?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
+    r"(?<=[{a}][{d}])(?:{h})(?=[{a}])".format(
+        a=ALPHA, d=COMBINING_DIACRITICS, h=HYPHENS
+    ),
+    r"(?<=[{a}][{d}])[:<>=/](?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
+]
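
Each new rule differs from its stock counterpart only in the lookbehind: the stock rules inspect the single character immediately before the punctuation, and a combining accent is not in ALPHA, so a word like яйца̀ previously kept its trailing period attached. A simplified standalone demonstration; ALPHA is deliberately reduced to basic Cyrillic here, whereas spaCy's real class is far broader:

# Simplified standalone demonstration of why the extended suffix rule is
# needed. The stock lookbehind sees the combining accent, not the letter.
import re

ALPHA = "а-яА-Я"  # simplified stand-in for spaCy's ALPHA
COMBINING_DIACRITICS = r"\u0300-\u036f"

stock = re.compile(r"(?<=[{a}])\.$".format(a=ALPHA))
extended = re.compile(r"(?<=[{a}][{d}])\.$".format(a=ALPHA, d=COMBINING_DIACRITICS))

text = "яйца\u0300."  # "яйца̀." with a combining grave accent before the period
assert stock.search(text) is None  # the accent blocks the stock lookbehind
assert extended.search(text) is not None  # the new rule splits the period off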

spacy/lang/ru/__init__.py

@@ -5,6 +5,8 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
 from ...language import Language, BaseDefaults
@@ -12,6 +14,8 @@ class RussianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
+    suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
+    infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES

 class Russian(Language):

spacy/lang/uk/__init__.py

@@ -6,6 +6,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import UkrainianLemmatizer
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
 from ...language import Language, BaseDefaults
@@ -13,6 +15,8 @@ class UkrainianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
+    suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
+    infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES

 class Ukrainian(Language):

spacy/tests/lang/bg/test_tokenizer.py

@@ -0,0 +1,8 @@
+import pytest
+
+
+def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer):
+    text = "Ня̀маше яйца̀. Ня̀маше яйца̀."
+    tokens = bg_tokenizer(text)
+    assert tokens[1].text == "яйца̀"
+    assert tokens[2].text == "."
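
The behaviour this test pins down, reproduced end to end. This sketch assumes a spaCy build that already contains this change; no trained pipeline is needed:

# End-to-end sketch: a blank Bulgarian pipeline now splits the period off
# a word that ends in a vowel carrying a combining stress mark.
import spacy

nlp = spacy.blank("bg")
doc = nlp("Ня̀маше яйца̀.")
print([t.text for t in doc])  # expected: ['Ня̀маше', 'яйца̀', '.']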

spacy/tests/lang/ru/test_tokenizer.py

@@ -1,3 +1,4 @@
+from string import punctuation
 import pytest
@@ -122,3 +123,36 @@ def test_ru_tokenizer_splits_bracket_period(ru_tokenizer):
     text = "(Раз, два, три, проверка)."
     tokens = ru_tokenizer(text)
     assert tokens[len(tokens) - 1].text == "."
+
+
+@pytest.mark.parametrize(
+    "text",
+    [
+        "рекоменду́я подда́ть жару́. Самого́ Баргамота",
+        "РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́. САМОГО́ БАРГАМОТА",
+        "рекоменду̍я подда̍ть жару̍.Самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍.'Самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍,самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍:самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍. самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍, самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍: самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍-самого̍ Баргамота",
+    ],
+)
+def test_ru_tokenizer_handles_final_diacritics(ru_tokenizer, text):
+    tokens = ru_tokenizer(text)
+    assert tokens[2].text in ("жару́", "ЖАРУ́", "жару̍")
+    assert tokens[3].text in punctuation
+
+
+@pytest.mark.parametrize(
+    "text",
+    [
+        "РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́.САМОГО́ БАРГАМОТА",
+        "рекоменду̍я подда̍ть жару́.самого́ Баргамота",
+    ],
+)
+def test_ru_tokenizer_handles_final_diacritic_and_period(ru_tokenizer, text):
+    tokens = ru_tokenizer(text)
+    assert tokens[2].text.lower() == "жару́.самого́"

spacy/tests/lang/uk/test_tokenizer.py

@@ -140,3 +140,10 @@ def test_uk_tokenizer_splits_bracket_period(uk_tokenizer):
     text = "(Раз, два, три, проверка)."
     tokens = uk_tokenizer(text)
     assert tokens[len(tokens) - 1].text == "."
+
+
+def test_uk_tokenizer_handles_final_diacritics(uk_tokenizer):
+    text = "Хлібі́в не було́. Хлібі́в не було́."
+    tokens = uk_tokenizer(text)
+    assert tokens[2].text == "було́"
+    assert tokens[3].text == "."