Update NORMs in tokenizer exceptions and related tests

Adriane Boyd 2023-05-08 11:04:58 +02:00
parent 560ed4b491
commit 01829933fd
2 changed files with 9 additions and 6 deletions

spacy/lang/sr/tokenizer_exceptions.py

@@ -1,3 +1,4 @@
+from .lex_attrs import _cyr_to_latin_norm
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, NORM
 from ...util import update_exc
@@ -89,5 +90,7 @@ _slang_exc = [
 for slang_desc in _slang_exc:
     _exc[slang_desc[ORTH]] = [slang_desc]

+for _exc_key in _exc:
+    _exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM])

 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
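
For context, the new loop above rewrites every NORM in the Serbian exception table from Cyrillic to Latin script before the table is merged into the base exceptions. Below is a minimal standalone sketch of the idea; the _cyr_to_latin_norm shown here is a simplified stand-in (the real transliteration is imported from this language's lex_attrs module and also covers letters and digraphs not listed in this toy mapping):

    from spacy.symbols import ORTH, NORM

    # Simplified stand-in mapping; illustrative only, not the real table.
    _CYR2LAT = {"и": "i", "л": "l", "ч": "č", "е": "e", "т": "t"}

    def _cyr_to_latin_norm(text):
        # Transliterate character by character, leaving unmapped characters unchanged.
        return "".join(_CYR2LAT.get(ch, ch) for ch in text)

    _exc = {"ил'": [{ORTH: "ил'", NORM: "или"}]}

    # Same shape as the committed loop: rewrite each exception's NORM in place.
    for _exc_key in _exc:
        _exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM])

    assert _exc["ил'"][0][NORM] == "ili"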

spacy/tests/lang/sr/test_exceptions.py

@@ -2,15 +2,15 @@ import pytest
 @pytest.mark.parametrize(
-    "text,norms,lemmas",
+    "text,norms",
     [
-        ("о.г.", ["ове године"], ["ова година"]),
-        ("чет.", ["четвртак"], ["четвртак"]),
-        ("гђа", ["госпођа"], ["госпођа"]),
-        ("ил'", ["или"], ["или"]),
+        ("о.г.", ["ove godine"]),
+        ("чет.", ["četvrtak"]),
+        ("гђа", ["gospođa"]),
+        ("ил'", ["ili"]),
     ],
 )
-def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
+def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms):
     tokens = sr_tokenizer(text)
     assert len(tokens) == 1
     assert [token.norm_ for token in tokens] == norms
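
Assuming a blank Serbian pipeline, the updated expectations can be reproduced outside the test suite (a sketch, not part of this commit):

    import spacy

    nlp = spacy.blank("sr")
    doc = nlp("чет.")
    # With this change, the exception's NORM comes back in Latin script.
    assert [t.norm_ for t in doc] == ["četvrtak"]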