From b5af0fe836945ce4b1b63dbd10606d2fe2ad9789 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 11 May 2023 11:54:16 +0200
Subject: [PATCH] Revert "Use Latin normalization for Serbian attrs (#12608)" (#12621)

This reverts commit 6f314f99c42c3503b89391798a58befbbc23bee4.

We are reverting this until we can support this normalization more consistently across vectors, training corpora, and lemmatizer data.
---
 spacy/lang/sr/lex_attrs.py             | 55 +-------------------------
 spacy/lang/sr/tokenizer_exceptions.py  |  3 --
 spacy/tests/lang/sr/test_exceptions.py | 12 +++---
 spacy/tests/lang/sr/test_lex_attrs.py  | 17 --------
 4 files changed, 8 insertions(+), 79 deletions(-)
 delete mode 100644 spacy/tests/lang/sr/test_lex_attrs.py

diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py
index a356a6a7a..dc48909bc 100644
--- a/spacy/lang/sr/lex_attrs.py
+++ b/spacy/lang/sr/lex_attrs.py
@@ -1,4 +1,4 @@
-from ...attrs import LIKE_NUM, NORM, PREFIX, SUFFIX
+from ...attrs import LIKE_NUM
 
 
 _num_words = [
@@ -63,55 +63,4 @@ def like_num(text):
     return False
 
 
-def _cyr_to_latin_norm(text):
-    # fmt: off
-    # source: https://github.com/opendatakosovo/cyrillic-transliteration/blob/v1.1.1/cyrtranslit/mapping.py
-    SR_CYR_TO_LAT_DICT = {
-        u'А': u'A', u'а': u'a',
-        u'Б': u'B', u'б': u'b',
-        u'В': u'V', u'в': u'v',
-        u'Г': u'G', u'г': u'g',
-        u'Д': u'D', u'д': u'd',
-        u'Ђ': u'Đ', u'ђ': u'đ',
-        u'Е': u'E', u'е': u'e',
-        u'Ж': u'Ž', u'ж': u'ž',
-        u'З': u'Z', u'з': u'z',
-        u'И': u'I', u'и': u'i',
-        u'Ј': u'J', u'ј': u'j',
-        u'К': u'K', u'к': u'k',
-        u'Л': u'L', u'л': u'l',
-        u'Љ': u'Lj', u'љ': u'lj',
-        u'М': u'M', u'м': u'm',
-        u'Н': u'N', u'н': u'n',
-        u'Њ': u'Nj', u'њ': u'nj',
-        u'О': u'O', u'о': u'o',
-        u'П': u'P', u'п': u'p',
-        u'Р': u'R', u'р': u'r',
-        u'С': u'S', u'с': u's',
-        u'Т': u'T', u'т': u't',
-        u'Ћ': u'Ć', u'ћ': u'ć',
-        u'У': u'U', u'у': u'u',
-        u'Ф': u'F', u'ф': u'f',
-        u'Х': u'H', u'х': u'h',
-        u'Ц': u'C', u'ц': u'c',
-        u'Ч': u'Č', u'ч': u'č',
-        u'Џ': u'Dž', u'џ': u'dž',
-        u'Ш': u'Š', u'ш': u'š',
-    }
-    # fmt: on
-    return "".join(SR_CYR_TO_LAT_DICT.get(c, c) for c in text)
-
-
-def norm(text):
-    return _cyr_to_latin_norm(text).lower()
-
-
-def prefix(text):
-    return _cyr_to_latin_norm(text)[0]
-
-
-def suffix(text):
-    return _cyr_to_latin_norm(text)[-3:]
-
-
-LEX_ATTRS = {LIKE_NUM: like_num, NORM: norm, PREFIX: prefix, SUFFIX: suffix}
+LEX_ATTRS = {LIKE_NUM: like_num}
diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py
index 053306088..dcaa3e239 100755
--- a/spacy/lang/sr/tokenizer_exceptions.py
+++ b/spacy/lang/sr/tokenizer_exceptions.py
@@ -1,4 +1,3 @@
-from .lex_attrs import _cyr_to_latin_norm
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, NORM
 from ...util import update_exc
@@ -90,7 +89,5 @@ _slang_exc = [
 for slang_desc in _slang_exc:
     _exc[slang_desc[ORTH]] = [slang_desc]
 
-for _exc_key in _exc:
-    _exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM])
 
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/tests/lang/sr/test_exceptions.py b/spacy/tests/lang/sr/test_exceptions.py
index e8819e628..fa92e5e2d 100644
--- a/spacy/tests/lang/sr/test_exceptions.py
+++ b/spacy/tests/lang/sr/test_exceptions.py
@@ -2,15 +2,15 @@ import pytest
 
 
 @pytest.mark.parametrize(
-    "text,norms",
+    "text,norms,lemmas",
     [
-        ("о.г.", ["ove godine"]),
-        ("чет.", ["četvrtak"]),
-        ("гђа", ["gospođa"]),
-        ("ил'", ["ili"]),
+        ("о.г.", ["ове године"], ["ова година"]),
+        ("чет.", ["четвртак"], ["четвртак"]),
+        ("гђа", ["госпођа"], ["госпођа"]),
+        ("ил'", ["или"], ["или"]),
     ],
 )
-def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms):
+def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
     tokens = sr_tokenizer(text)
     assert len(tokens) == 1
     assert [token.norm_ for token in tokens] == norms
diff --git a/spacy/tests/lang/sr/test_lex_attrs.py b/spacy/tests/lang/sr/test_lex_attrs.py
deleted file mode 100644
index 4a8039df5..000000000
--- a/spacy/tests/lang/sr/test_lex_attrs.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import pytest
-
-
-@pytest.mark.parametrize(
-    "text,like_num,norm,prefix,suffix",
-    [
-        ("нула", True, "nula", "n", "ula"),
-        ("Казна", False, "kazna", "K", "zna"),
-    ],
-)
-def test_lex_attrs(sr_tokenizer, text, like_num, norm, prefix, suffix):
-    tokens = sr_tokenizer(text)
-    assert len(tokens) == 1
-    assert tokens[0].like_num == like_num
-    assert tokens[0].norm_ == norm
-    assert tokens[0].prefix_ == prefix
-    assert tokens[0].suffix_ == suffix