mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 04:40:20 +03:00
Update NORMs in tokenizer exceptions and related tests
This commit is contained in:
parent
560ed4b491
commit
01829933fd
|
@ -1,3 +1,4 @@
|
||||||
|
from .lex_attrs import _cyr_to_latin_norm
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import ORTH, NORM
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
@ -89,5 +90,7 @@ _slang_exc = [
|
||||||
for slang_desc in _slang_exc:
|
for slang_desc in _slang_exc:
|
||||||
_exc[slang_desc[ORTH]] = [slang_desc]
|
_exc[slang_desc[ORTH]] = [slang_desc]
|
||||||
|
|
||||||
|
for _exc_key in _exc:
|
||||||
|
_exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM])
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||||
|
|
|
@ -2,15 +2,15 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,norms,lemmas",
|
"text,norms",
|
||||||
[
|
[
|
||||||
("о.г.", ["ове године"], ["ова година"]),
|
("о.г.", ["ove godine"]),
|
||||||
("чет.", ["четвртак"], ["четвртак"]),
|
("чет.", ["četvrtak"]),
|
||||||
("гђа", ["госпођа"], ["госпођа"]),
|
("гђа", ["gospođa"]),
|
||||||
("ил'", ["или"], ["или"]),
|
("ил'", ["ili"]),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
|
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms):
|
||||||
tokens = sr_tokenizer(text)
|
tokens = sr_tokenizer(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
assert [token.norm_ for token in tokens] == norms
|
assert [token.norm_ for token in tokens] == norms
|
||||||
|
|
Loading…
Reference in New Issue
Block a user