mirror of https://github.com/explosion/spaCy.git
Handle Cyrillic combining diacritics (#10837)
* Handle Russian, Ukrainian and Bulgarian
* Corrections
* Correction
* Correction to comment
* Changes based on review
* Correction
* Reverted irrelevant change in punctuation.py
* Remove unnecessary group
* Reverted accidental change
This commit is contained in:
parent 8ffff18ac4
commit a9559e7435
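The change in a nutshell: spaCy's suffix and infix rules rely on fixed-width lookbehinds such as (?<=[{a}])\., and a combining accent like U+0300 or U+0301 is a separate code point, so a stressed final vowel hides the preceding letter from the lookbehind and trailing punctuation is never split off. A minimal reproduction in plain re (the character classes here are simplified stand-ins for spaCy's real ALPHA groups):

    import re

    text = "яйца̀."  # final "а" carries U+0300 COMBINING GRAVE ACCENT, a separate code point

    # Old-style suffix rule: the lookbehind sees the combining mark, not a letter,
    # so the final period is never split off.
    assert re.search(r"(?<=[а-яА-Я])\.", text) is None

    # Diacritic-aware rule introduced by this commit: permit one combining mark
    # between the letter and the period.
    assert re.search(r"(?<=[а-яА-Я][\u0300-\u036f])\.", text) is not None
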
spacy/lang/bg/__init__.py

@@ -2,7 +2,8 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
-
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
 from ...language import Language, BaseDefaults
 from ...attrs import LANG
 from ...util import update_exc
@@ -16,6 +17,8 @@ class BulgarianDefaults(BaseDefaults):
 
     stop_words = STOP_WORDS
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
+    infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
 
 
 class Bulgarian(Language):
spacy/lang/char_classes.py

@@ -258,6 +258,10 @@ ALPHA = group_chars(
 ALPHA_LOWER = group_chars(_lower + _uncased)
 ALPHA_UPPER = group_chars(_upper + _uncased)
 
+_combining_diacritics = r"\u0300-\u036f"
+
+COMBINING_DIACRITICS = _combining_diacritics
+
 _units = (
     "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
     "kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb "
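A note on why the r"\u0300-\u036f" range (the Unicode Combining Diacritical Marks block) has to be handled explicitly: a stressed Cyrillic vowel such as а̀ or у́ has no precomposed code point, so Unicode normalization cannot fold the accent into the base letter. A quick standard-library check:

    import unicodedata

    s = "яйца̀"  # "а" followed by U+0300 COMBINING GRAVE ACCENT

    # There is no precomposed "а with grave", so NFC normalization leaves the
    # combining mark in place and the tokenizer has to deal with it directly.
    assert unicodedata.normalize("NFC", s) == s
    assert unicodedata.name(s[-1]) == "COMBINING GRAVE ACCENT"
    assert "\u0300" <= s[-1] <= "\u036f"  # inside the new _combining_diacritics range
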
spacy/lang/punctuation.py

@@ -1,5 +1,5 @@
 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
+from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS
 from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
 
 
@@ -44,3 +44,23 @@ TOKENIZER_INFIXES = (
         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
+
+
+# Some languages e.g. written with the Cyrillic alphabet permit the use of diacritics
+# to mark stressed syllables in words where stress is distinctive. Such languages
+# should use the COMBINING_DIACRITICS... suffix and infix regex lists in
+# place of the standard ones.
+COMBINING_DIACRITICS_TOKENIZER_SUFFIXES = list(TOKENIZER_SUFFIXES) + [
+    r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS),
+]
+
+COMBINING_DIACRITICS_TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [
+    r"(?<=[{al}][{d}])\.(?=[{au}{q}])".format(
+        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES, d=COMBINING_DIACRITICS
+    ),
+    r"(?<=[{a}][{d}]),(?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
+    r"(?<=[{a}][{d}])(?:{h})(?=[{a}])".format(
+        a=ALPHA, d=COMBINING_DIACRITICS, h=HYPHENS
+    ),
+    r"(?<=[{a}][{d}])[:<>=/](?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
+]
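How these lists take effect: a language's Defaults hand them to the tokenizer, which compiles the suffix entries into a single end-anchored regex via spacy.util.compile_suffix_regex. A small sanity check of the new suffix rule in isolation (assuming a spaCy build that includes this commit):

    from spacy.lang.punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
    from spacy.util import compile_suffix_regex

    suffix_search = compile_suffix_regex(COMBINING_DIACRITICS_TOKENIZER_SUFFIXES).search

    # "яйца̀." ends in letter + U+0300 + period; the added rule matches the final
    # period even though the character directly before it is a combining mark.
    match = suffix_search("яйца̀.")
    assert match is not None and match.group() == "."
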
spacy/lang/ru/__init__.py

@@ -5,6 +5,8 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
 from ...language import Language, BaseDefaults
 
 
@@ -12,6 +14,8 @@ class RussianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
+    suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
+    infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
 
 
 class Russian(Language):
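Because RussianDefaults now points suffixes and infixes at the diacritics-aware lists, any Russian pipeline compiles them into its tokenizer. A quick way to confirm on a blank pipeline (spacy.blank is the standard tokenizer-only entry point):

    import spacy

    nlp = spacy.blank("ru")  # tokenizer is built from RussianDefaults

    # The compiled suffix regex now splits off a period that follows a stressed vowel:
    assert nlp.tokenizer.suffix_search("жару́.").group() == "."
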
spacy/lang/uk/__init__.py

@@ -6,6 +6,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import UkrainianLemmatizer
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
 from ...language import Language, BaseDefaults
 
 
@@ -13,6 +15,8 @@ class UkrainianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
+    suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
+    infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES
 
 
 class Ukrainian(Language):
spacy/tests/lang/bg/test_tokenizer.py (new file, 8 lines)

@@ -0,0 +1,8 @@
+import pytest
+
+
+def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer):
+    text = "Ня̀маше яйца̀. Ня̀маше яйца̀."
+    tokens = bg_tokenizer(text)
+    assert tokens[1].text == "яйца̀"
+    assert tokens[2].text == "."
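The bg_tokenizer fixture is not defined in this file; it follows the per-language fixture convention in spaCy's test conftest, roughly like the sketch below (a sketch of the pattern, not the verbatim fixture):

    import pytest
    from spacy.util import get_lang_class

    @pytest.fixture
    def bg_tokenizer():
        # Instantiate the Bulgarian Language subclass and expose only its tokenizer.
        return get_lang_class("bg")().tokenizer
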
spacy/tests/lang/ru/test_tokenizer.py

@@ -1,3 +1,4 @@
+from string import punctuation
 import pytest
 
 
@@ -122,3 +123,36 @@ def test_ru_tokenizer_splits_bracket_period(ru_tokenizer):
     text = "(Раз, два, три, проверка)."
     tokens = ru_tokenizer(text)
     assert tokens[len(tokens) - 1].text == "."
+
+
+@pytest.mark.parametrize(
+    "text",
+    [
+        "рекоменду́я подда́ть жару́. Самого́ Баргамота",
+        "РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́. САМОГО́ БАРГАМОТА",
+        "рекоменду̍я подда̍ть жару̍.Самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍.'Самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍,самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍:самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍. самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍, самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍: самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍-самого̍ Баргамота",
+    ],
+)
+def test_ru_tokenizer_handles_final_diacritics(ru_tokenizer, text):
+    tokens = ru_tokenizer(text)
+    assert tokens[2].text in ("жару́", "ЖАРУ́", "жару̍")
+    assert tokens[3].text in punctuation
+
+
+@pytest.mark.parametrize(
+    "text",
+    [
+        "РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́.САМОГО́ БАРГАМОТА",
+        "рекоменду̍я подда̍ть жару́.самого́ Баргамота",
+    ],
+)
+def test_ru_tokenizer_handles_final_diacritic_and_period(ru_tokenizer, text):
+    tokens = ru_tokenizer(text)
+    assert tokens[2].text.lower() == "жару́.самого́"
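For orientation, the token indices asserted above correspond to the following split for the first parametrized string; a sketch assuming a blank Russian pipeline:

    import spacy

    nlp = spacy.blank("ru")
    doc = nlp("рекоменду́я подда́ть жару́. Самого́ Баргамота")

    # The stressed final vowel no longer blocks the suffix split, so the period
    # becomes its own token (index 3), as the test asserts.
    assert [t.text for t in doc] == [
        "рекоменду́я", "подда́ть", "жару́", ".", "Самого́", "Баргамота"
    ]
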
spacy/tests/lang/uk/test_tokenizer.py

@@ -140,3 +140,10 @@ def test_uk_tokenizer_splits_bracket_period(uk_tokenizer):
     text = "(Раз, два, три, проверка)."
     tokens = uk_tokenizer(text)
     assert tokens[len(tokens) - 1].text == "."
+
+
+def test_uk_tokenizer_handles_final_diacritics(uk_tokenizer):
+    text = "Хлібі́в не було́. Хлібі́в не було́."
+    tokens = uk_tokenizer(text)
+    assert tokens[2].text == "було́"
+    assert tokens[3].text == "."