mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Use Latin normalization for Serbian attrs (#12608)
* Use Latin normalization for Serbian attrs

  Use Latin normalization for Serbian `NORM`, `PREFIX`, and `SUFFIX`.

* Update NORMs in tokenizer exceptions and related tests
* Add tests for all custom lex attrs
* Remove unused imports
This commit is contained in:
parent
cbc6bcf434
commit
6f314f99c4
|
@ -1,4 +1,4 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
from ...attrs import LIKE_NUM, NORM, PREFIX, SUFFIX
|
||||
|
||||
|
||||
_num_words = [
|
||||
|
@ -63,4 +63,55 @@ def like_num(text):
|
|||
return False
|
||||
|
||||
|
||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
||||
def _cyr_to_latin_norm(text):
|
||||
# fmt: off
|
||||
# source: https://github.com/opendatakosovo/cyrillic-transliteration/blob/v1.1.1/cyrtranslit/mapping.py
|
||||
SR_CYR_TO_LAT_DICT = {
|
||||
u'А': u'A', u'а': u'a',
|
||||
u'Б': u'B', u'б': u'b',
|
||||
u'В': u'V', u'в': u'v',
|
||||
u'Г': u'G', u'г': u'g',
|
||||
u'Д': u'D', u'д': u'd',
|
||||
u'Ђ': u'Đ', u'ђ': u'đ',
|
||||
u'Е': u'E', u'е': u'e',
|
||||
u'Ж': u'Ž', u'ж': u'ž',
|
||||
u'З': u'Z', u'з': u'z',
|
||||
u'И': u'I', u'и': u'i',
|
||||
u'Ј': u'J', u'ј': u'j',
|
||||
u'К': u'K', u'к': u'k',
|
||||
u'Л': u'L', u'л': u'l',
|
||||
u'Љ': u'Lj', u'љ': u'lj',
|
||||
u'М': u'M', u'м': u'm',
|
||||
u'Н': u'N', u'н': u'n',
|
||||
u'Њ': u'Nj', u'њ': u'nj',
|
||||
u'О': u'O', u'о': u'o',
|
||||
u'П': u'P', u'п': u'p',
|
||||
u'Р': u'R', u'р': u'r',
|
||||
u'С': u'S', u'с': u's',
|
||||
u'Т': u'T', u'т': u't',
|
||||
u'Ћ': u'Ć', u'ћ': u'ć',
|
||||
u'У': u'U', u'у': u'u',
|
||||
u'Ф': u'F', u'ф': u'f',
|
||||
u'Х': u'H', u'х': u'h',
|
||||
u'Ц': u'C', u'ц': u'c',
|
||||
u'Ч': u'Č', u'ч': u'č',
|
||||
u'Џ': u'Dž', u'џ': u'dž',
|
||||
u'Ш': u'Š', u'ш': u'š',
|
||||
}
|
||||
# fmt: on
|
||||
return "".join(SR_CYR_TO_LAT_DICT.get(c, c) for c in text)
|
||||
|
||||
|
||||
def norm(text):
    """Return the lowercased Latin-script transliteration of ``text``."""
    latin = _cyr_to_latin_norm(text)
    return latin.lower()
|
||||
|
||||
|
||||
def prefix(text):
    """Return the first character of the Latin-script form of ``text``.

    The text is transliterated first, so a leading Cyrillic letter that maps
    to a digraph (e.g. "Lj") contributes only its first Latin character.
    A slice is used instead of an index so that an empty string yields an
    empty prefix rather than raising ``IndexError``.

    text (str): The token text.
    RETURNS (str): The one-character prefix, or "" for empty input.
    """
    return _cyr_to_latin_norm(text)[:1]
|
||||
|
||||
|
||||
def suffix(text):
    """Return up to the last three characters of the Latin-script form of ``text``."""
    latin = _cyr_to_latin_norm(text)
    return latin[-3:]
|
||||
|
||||
|
||||
# Lexical attribute getters for Serbian, keyed by attribute ID.
LEX_ATTRS = {
    LIKE_NUM: like_num,
    NORM: norm,
    PREFIX: prefix,
    SUFFIX: suffix,
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
from .lex_attrs import _cyr_to_latin_norm
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...util import update_exc
|
||||
|
@ -89,5 +90,7 @@ _slang_exc = [
|
|||
for slang_desc in _slang_exc:
|
||||
_exc[slang_desc[ORTH]] = [slang_desc]
|
||||
|
||||
# Transliterate the NORM of the first token of every exception entry with the
# same helper the lexical attribute getters use, so exception norms and
# regular token norms end up in the same (Latin) script.
for _exc_key in _exc:
    _exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM])
|
||||
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -2,15 +2,15 @@ import pytest
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,norms,lemmas",
|
||||
"text,norms",
|
||||
[
|
||||
("о.г.", ["ове године"], ["ова година"]),
|
||||
("чет.", ["четвртак"], ["четвртак"]),
|
||||
("гђа", ["госпођа"], ["госпођа"]),
|
||||
("ил'", ["или"], ["или"]),
|
||||
("о.г.", ["ove godine"]),
|
||||
("чет.", ["četvrtak"]),
|
||||
("гђа", ["gospođa"]),
|
||||
("ил'", ["ili"]),
|
||||
],
|
||||
)
|
||||
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
|
||||
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms):
|
||||
tokens = sr_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert [token.norm_ for token in tokens] == norms
|
||||
|
|
17
spacy/tests/lang/sr/test_lex_attrs.py
Normal file
17
spacy/tests/lang/sr/test_lex_attrs.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "text,like_num,norm,prefix,suffix",
    [
        ("нула", True, "nula", "n", "ula"),
        ("Казна", False, "kazna", "K", "zna"),
    ],
)
def test_lex_attrs(sr_tokenizer, text, like_num, norm, prefix, suffix):
    """Check the custom Serbian lexical attributes on a single-token text."""
    doc = sr_tokenizer(text)
    assert len(doc) == 1
    # Each input is expected to stay one token; inspect that token only.
    token = doc[0]
    assert token.like_num == like_num
    assert token.norm_ == norm
    assert token.prefix_ == prefix
    assert token.suffix_ == suffix
|
Loading…
Reference in New Issue
Block a user