Revert "Use Latin normalization for Serbian attrs (#12608)" (#12621)

This reverts commit 6f314f99c4.

We are reverting this until we can support this normalization more
consistently across vectors, training corpora, and lemmatizer data.
This commit is contained in:
Adriane Boyd 2023-05-11 11:54:16 +02:00 committed by GitHub
parent 3252f6b13f
commit b5af0fe836
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 8 additions and 79 deletions

View File

@@ -1,4 +1,4 @@
from ...attrs import LIKE_NUM, NORM, PREFIX, SUFFIX
from ...attrs import LIKE_NUM
_num_words = [
@@ -63,55 +63,4 @@ def like_num(text):
return False
def _cyr_to_latin_norm(text):
# fmt: off
# source: https://github.com/opendatakosovo/cyrillic-transliteration/blob/v1.1.1/cyrtranslit/mapping.py
SR_CYR_TO_LAT_DICT = {
u'А': u'A', u'а': u'a',
u'Б': u'B', u'б': u'b',
u'В': u'V', u'в': u'v',
u'Г': u'G', u'г': u'g',
u'Д': u'D', u'д': u'd',
u'Ђ': u'Đ', u'ђ': u'đ',
u'Е': u'E', u'е': u'e',
u'Ж': u'Ž', u'ж': u'ž',
u'З': u'Z', u'з': u'z',
u'И': u'I', u'и': u'i',
u'Ј': u'J', u'ј': u'j',
u'К': u'K', u'к': u'k',
u'Л': u'L', u'л': u'l',
u'Љ': u'Lj', u'љ': u'lj',
u'М': u'M', u'м': u'm',
u'Н': u'N', u'н': u'n',
u'Њ': u'Nj', u'њ': u'nj',
u'О': u'O', u'о': u'o',
u'П': u'P', u'п': u'p',
u'Р': u'R', u'р': u'r',
u'С': u'S', u'с': u's',
u'Т': u'T', u'т': u't',
u'Ћ': u'Ć', u'ћ': u'ć',
u'У': u'U', u'у': u'u',
u'Ф': u'F', u'ф': u'f',
u'Х': u'H', u'х': u'h',
u'Ц': u'C', u'ц': u'c',
u'Ч': u'Č', u'ч': u'č',
u'Џ': u'', u'џ': u'',
u'Ш': u'Š', u'ш': u'š',
}
# fmt: on
return "".join(SR_CYR_TO_LAT_DICT.get(c, c) for c in text)
def norm(text):
    """NORM lex attr: Latin-script transliteration of *text*, lowercased."""
    transliterated = _cyr_to_latin_norm(text)
    return transliterated.lower()
def prefix(text):
    """PREFIX lex attr: first character of the Latin transliteration."""
    transliterated = _cyr_to_latin_norm(text)
    return transliterated[0]
def suffix(text):
    """SUFFIX lex attr: last three characters of the Latin transliteration."""
    transliterated = _cyr_to_latin_norm(text)
    return transliterated[-3:]
LEX_ATTRS = {LIKE_NUM: like_num, NORM: norm, PREFIX: prefix, SUFFIX: suffix}
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@@ -1,4 +1,3 @@
from .lex_attrs import _cyr_to_latin_norm
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
@@ -90,7 +89,5 @@ _slang_exc = [
# NOTE(review): scraped diff hunk — the loop bodies lost their indentation in
# extraction, and the `_cyr_to_latin_norm` line below is the one REMOVED by
# this revert (post-revert, norms stay in Cyrillic). Restore from VCS rather
# than editing this rendering.
# Register each slang entry as a single-token tokenizer exception.
for slang_desc in _slang_exc:
_exc[slang_desc[ORTH]] = [slang_desc]
# (pre-revert) rewrite every exception's NORM to its Latin transliteration
for _exc_key in _exc:
_exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM])
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)

View File

@@ -2,15 +2,15 @@ import pytest
# NOTE(review): this hunk interleaves the pre- and post-revert versions of the
# test — both parametrize signatures ("text,norms" vs "text,norms,lemmas") and
# both `def` lines are present, so the span below is not valid Python as-is.
# Consult the actual commit for either complete version.
@pytest.mark.parametrize(
"text,norms",
"text,norms,lemmas",
[
# (pre-revert cases: Latin-script norms)
("о.г.", ["ove godine"]),
("чет.", ["četvrtak"]),
("гђа", ["gospođa"]),
("ил'", ["ili"]),
# (post-revert cases: Cyrillic norms plus expected lemmas)
("о.г.", ["ове године"], ["ова година"]),
("чет.", ["четвртак"], ["четвртак"]),
("гђа", ["госпођа"], ["госпођа"]),
("ил'", ["или"], ["или"]),
],
)
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms):
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
tokens = sr_tokenizer(text)
assert len(tokens) == 1
assert [token.norm_ for token in tokens] == norms

View File

@@ -1,17 +0,0 @@
import pytest
@pytest.mark.parametrize(
    "text,like_num,norm,prefix,suffix",
    [
        ("нула", True, "nula", "n", "ula"),
        ("Казна", False, "kazna", "K", "zna"),
    ],
)
def test_lex_attrs(sr_tokenizer, text, like_num, norm, prefix, suffix):
    """Check the like_num/norm/prefix/suffix lex attrs on a single token."""
    doc = sr_tokenizer(text)
    assert len(doc) == 1
    token = doc[0]
    assert token.like_num == like_num
    assert token.norm_ == norm
    assert token.prefix_ == prefix
    assert token.suffix_ == suffix