mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-04 20:30:24 +03:00
Update NORMs in tokenizer exceptions and related tests
This commit is contained in:
parent
560ed4b491
commit
01829933fd
|
@ -1,3 +1,4 @@
|
|||
from .lex_attrs import _cyr_to_latin_norm
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...util import update_exc
|
||||
|
@ -89,5 +90,7 @@ _slang_exc = [
|
|||
for slang_desc in _slang_exc:
|
||||
_exc[slang_desc[ORTH]] = [slang_desc]
|
||||
|
||||
for _exc_key in _exc:
|
||||
_exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM])
|
||||
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
|
@ -2,15 +2,15 @@ import pytest
|
|||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,norms,lemmas",
|
||||
"text,norms",
|
||||
[
|
||||
("о.г.", ["ове године"], ["ова година"]),
|
||||
("чет.", ["четвртак"], ["четвртак"]),
|
||||
("гђа", ["госпођа"], ["госпођа"]),
|
||||
("ил'", ["или"], ["или"]),
|
||||
("о.г.", ["ove godine"]),
|
||||
("чет.", ["četvrtak"]),
|
||||
("гђа", ["gospođa"]),
|
||||
("ил'", ["ili"]),
|
||||
],
|
||||
)
|
||||
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
|
||||
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms):
|
||||
tokens = sr_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert [token.norm_ for token in tokens] == norms
|
||||
|
|
Loading…
Reference in New Issue
Block a user