Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-24 17:06:29 +03:00
Handle Cyrillic combining diacritics (#10837)
* Handle Russian, Ukrainian and Bulgarian
* Corrections
* Correction
* Correction to comment
* Changes based on review
* Correction
* Reverted irrelevant change in punctuation.py
* Remove unnecessary group
* Reverted accidental change
This commit is contained in: parent 8ffff18ac4, commit a9559e7435
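For context, a minimal sketch of the user-visible effect, assuming a spaCy build that contains this commit (the example text is taken from the new Bulgarian test added below): a trailing period after a word that ends in a combining diacritic is now split off as its own token.

import spacy

# Minimal sketch, assuming a spaCy build that includes this commit.
nlp = spacy.blank("bg")
doc = nlp("Ня̀маше яйца̀. Ня̀маше яйца̀.")
print([t.text for t in doc])
# Expected with this change (per the new bg test below):
# ['Ня̀маше', 'яйца̀', '.', 'Ня̀маше', 'яйца̀', '.']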
spacy/lang/bg/__init__.py
@@ -2,7 +2,8 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
 from ...language import Language, BaseDefaults
 from ...attrs import LANG
 from ...util import update_exc
@@ -16,6 +17,8 @@ class BulgarianDefaults(BaseDefaults):
     stop_words = STOP_WORDS
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
+    infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES


 class Bulgarian(Language):
spacy/lang/char_classes.py
@@ -258,6 +258,10 @@ ALPHA = group_chars(
 ALPHA_LOWER = group_chars(_lower + _uncased)
 ALPHA_UPPER = group_chars(_upper + _uncased)

+_combining_diacritics = r"\u0300-\u036f"
+
+COMBINING_DIACRITICS = _combining_diacritics
+
 _units = (
     "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
     "kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb "
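As a quick illustration of what the new character range covers (a standalone sketch, not part of the commit): U+0300–U+036F is the Unicode "Combining Diacritical Marks" block, which includes the combining grave and acute accents used to mark stress in the tests below.

import re
import unicodedata

# Standalone sketch: the range \u0300-\u036f added above covers the combining
# grave (U+0300) and acute (U+0301) accents used to mark stress.
combining = re.compile(r"[\u0300-\u036f]")
word = "яйца̀"  # ends in "а" followed by U+0300
print([unicodedata.name(ch) for ch in word if combining.match(ch)])
# ['COMBINING GRAVE ACCENT']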
spacy/lang/punctuation.py
@@ -1,5 +1,5 @@
 from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
+from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS, COMBINING_DIACRITICS
 from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
@@ -44,3 +44,23 @@ TOKENIZER_INFIXES = (
         r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
+
+
+# Some languages e.g. written with the Cyrillic alphabet permit the use of diacritics
+# to mark stressed syllables in words where stress is distinctive. Such languages
+# should use the COMBINING_DIACRITICS... suffix and infix regex lists in
+# place of the standard ones.
+COMBINING_DIACRITICS_TOKENIZER_SUFFIXES = list(TOKENIZER_SUFFIXES) + [
+    r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS),
+]
+
+COMBINING_DIACRITICS_TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [
+    r"(?<=[{al}][{d}])\.(?=[{au}{q}])".format(
+        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES, d=COMBINING_DIACRITICS
+    ),
+    r"(?<=[{a}][{d}]),(?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
+    r"(?<=[{a}][{d}])(?:{h})(?=[{a}])".format(
+        a=ALPHA, d=COMBINING_DIACRITICS, h=HYPHENS
+    ),
+    r"(?<=[{a}][{d}])[:<>=/](?=[{a}])".format(a=ALPHA, d=COMBINING_DIACRITICS),
+]
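To make the intent of the extra suffix rule concrete, here is a standalone sketch (the import of COMBINING_DIACRITICS assumes a build that contains this commit): the pattern only fires when the character immediately before the period is a combining diacritic preceded by a letter.

import re

from spacy.lang.char_classes import ALPHA, COMBINING_DIACRITICS  # latter exists only with this commit

# Standalone sketch of the extra suffix rule added above.
suffix = r"(?<=[{a}][{d}])\.".format(a=ALPHA, d=COMBINING_DIACRITICS)
print(bool(re.search(suffix, "жару́.")))  # True: "." follows "у" + U+0301
print(bool(re.search(suffix, "жару.")))  # False: no combining diacritic before "."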
spacy/lang/ru/__init__.py
@@ -5,6 +5,8 @@ from .stop_words import STOP_WORDS
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
 from ...language import Language, BaseDefaults
@@ -12,6 +14,8 @@ class RussianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
+    suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
+    infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES


 class Russian(Language):
spacy/lang/uk/__init__.py
@@ -6,6 +6,8 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import UkrainianLemmatizer
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
+from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
 from ...language import Language, BaseDefaults
@@ -13,6 +15,8 @@ class UkrainianDefaults(BaseDefaults):
     tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
+    suffixes = COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
+    infixes = COMBINING_DIACRITICS_TOKENIZER_INFIXES


 class Ukrainian(Language):
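The three languages above opt in by overriding suffixes and infixes on their Defaults. As an aside not taken from the commit, a pipeline for a language that does not opt in could reuse the same lists at runtime through spaCy's documented tokenizer-customization hooks; Serbian is chosen here purely as an illustration of another Cyrillic-script language.

import spacy
from spacy.util import compile_infix_regex, compile_suffix_regex
from spacy.lang.punctuation import (
    COMBINING_DIACRITICS_TOKENIZER_INFIXES,  # these lists exist only with this commit
    COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
)

# Illustrative sketch: swap the diacritics-aware rules into an existing tokenizer.
nlp = spacy.blank("sr")  # hypothetical example language, not part of the commit
nlp.tokenizer.suffix_search = compile_suffix_regex(COMBINING_DIACRITICS_TOKENIZER_SUFFIXES).search
nlp.tokenizer.infix_finditer = compile_infix_regex(COMBINING_DIACRITICS_TOKENIZER_INFIXES).finditer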
spacy/tests/lang/bg/test_tokenizer.py (new file, 8 lines)
@@ -0,0 +1,8 @@
+import pytest
+
+
+def test_bg_tokenizer_handles_final_diacritics(bg_tokenizer):
+    text = "Ня̀маше яйца̀. Ня̀маше яйца̀."
+    tokens = bg_tokenizer(text)
+    assert tokens[1].text == "яйца̀"
+    assert tokens[2].text == "."
spacy/tests/lang/ru/test_tokenizer.py
@@ -1,3 +1,4 @@
+from string import punctuation
 import pytest
@@ -122,3 +123,36 @@ def test_ru_tokenizer_splits_bracket_period(ru_tokenizer):
     text = "(Раз, два, три, проверка)."
     tokens = ru_tokenizer(text)
     assert tokens[len(tokens) - 1].text == "."
+
+
+@pytest.mark.parametrize(
+    "text",
+    [
+        "рекоменду́я подда́ть жару́. Самого́ Баргамота",
+        "РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́. САМОГО́ БАРГАМОТА",
+        "рекоменду̍я подда̍ть жару̍.Самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍.'Самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍,самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍:самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍. самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍, самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍: самого̍ Баргамота",
+        "рекоменду̍я подда̍ть жару̍-самого̍ Баргамота",
+    ],
+)
+def test_ru_tokenizer_handles_final_diacritics(ru_tokenizer, text):
+    tokens = ru_tokenizer(text)
+    assert tokens[2].text in ("жару́", "ЖАРУ́", "жару̍")
+    assert tokens[3].text in punctuation
+
+
+@pytest.mark.parametrize(
+    "text",
+    [
+        "РЕКОМЕНДУ́Я ПОДДА́ТЬ ЖАРУ́.САМОГО́ БАРГАМОТА",
+        "рекоменду̍я подда̍ть жару́.самого́ Баргамота",
+    ],
+)
+def test_ru_tokenizer_handles_final_diacritic_and_period(ru_tokenizer, text):
+    tokens = ru_tokenizer(text)
+    assert tokens[2].text.lower() == "жару́.самого́"
spacy/tests/lang/uk/test_tokenizer.py
@@ -140,3 +140,10 @@ def test_uk_tokenizer_splits_bracket_period(uk_tokenizer):
     text = "(Раз, два, три, проверка)."
     tokens = uk_tokenizer(text)
    assert tokens[len(tokens) - 1].text == "."
+
+
+def test_uk_tokenizer_handles_final_diacritics(uk_tokenizer):
+    text = "Хлібі́в не було́. Хлібі́в не було́."
+    tokens = uk_tokenizer(text)
+    assert tokens[2].text == "було́"
+    assert tokens[3].text == "."