2023-06-14 18:48:41 +03:00
|
|
|
|
from ...symbols import NORM, ORTH
|
2020-07-22 23:18:46 +03:00
|
|
|
|
from ...util import update_exc
|
2023-06-14 18:48:41 +03:00
|
|
|
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
2019-02-07 23:05:11 +03:00
|
|
|
|
|
|
|
|
|
# Abbreviation table as (abbreviation text, expanded form) pairs.
# The first element is stored under ORTH and the second under NORM.
_ABBREVIATIONS = (
    ("обл.", "область"),
    ("р-н.", "район"),
    ("р-н", "район"),
    ("м.", "місто"),
    ("вул.", "вулиця"),
    ("просп.", "проспект"),
    ("пр-кт", "проспект"),
    ("бул.", "бульвар"),
    ("пров.", "провулок"),
    ("пл.", "площа"),
    ("майд.", "майдан"),
    ("мкр.", "мікрорайон"),
    ("ст.", "станція"),
    ("ж/м", "житловий масив"),
    ("наб.", "набережна"),
    ("в/ч", "військова частина"),
    ("в/м", "військове містечко"),
    ("оз.", "озеро"),
    ("ім.", "імені"),
    ("г.", "гора"),
    ("п.", "пан"),
    ("проф.", "професор"),
    ("акад.", "академік"),
    ("доц.", "доцент"),
)

# Keyed by the abbreviation text; every value is a one-element list holding
# the {ORTH, NORM} attribute dict for that abbreviation.
_exc = {
    orth: [{ORTH: orth, NORM: norm}]
    for orth, norm in _ABBREVIATIONS
}

# Final table: the shared BASE_EXCEPTIONS combined with the entries above
# through update_exc.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|