From 60e10a9f935e51c05959f7c3d3cd86f50a99aa9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pavle=20Vidanovi=C4=87?=
Date: Thu, 22 Aug 2019 11:43:07 +0200
Subject: [PATCH] Serbian language improvement (#4169)

* Serbian stopwords added. (Cyrillic alphabet)
* spaCy contribution agreement included.
* Test initialization updated.
* Serbian language code updated. --bugfix
* Tokenizer exceptions added. Init file updated.
* Norm exceptions and lexical attributes added.
* Examples added.
* Tests added.
* sr_lang examples updated.
* Tokenizer exceptions updated. (Serbian)
---
 spacy/lang/sr/__init__.py              |   9 +-
 spacy/lang/sr/examples.py              |  25 +++
 spacy/lang/sr/lex_attrs.py             |  69 +++++++++++++
 spacy/lang/sr/norm_exceptions.py       |  26 +++++
 spacy/lang/sr/tokenizer_exceptions.py  |  95 ++++++++++++++++++
 spacy/tests/conftest.py                |   5 +
 spacy/tests/lang/sr/__init__.py        |   0
 spacy/tests/lang/sr/test_tokenizer.py  | 128 +++++++++++++++++++++++++
 spacy/tests/lang/sr/test_еxceptions.py |  14 +++
 9 files changed, 368 insertions(+), 3 deletions(-)
 create mode 100644 spacy/lang/sr/examples.py
 create mode 100644 spacy/lang/sr/lex_attrs.py
 create mode 100644 spacy/lang/sr/norm_exceptions.py
 create mode 100755 spacy/lang/sr/tokenizer_exceptions.py
 create mode 100644 spacy/tests/lang/sr/__init__.py
 create mode 100644 spacy/tests/lang/sr/test_tokenizer.py
 create mode 100644 spacy/tests/lang/sr/test_еxceptions.py

diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py
index 99090be3b..f27b87102 100644
--- a/spacy/lang/sr/__init__.py
+++ b/spacy/lang/sr/__init__.py
@@ -2,7 +2,9 @@
 from __future__ import unicode_literals
 
 from .stop_words import STOP_WORDS
-
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
@@ -12,11 +14,12 @@ from ...util import update_exc, add_lookups
 
 class SerbianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "sr"
     lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
+        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
     )
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
 
 
diff --git a/spacy/lang/sr/examples.py b/spacy/lang/sr/examples.py
new file mode 100644
index 000000000..fb75f716d
--- /dev/null
+++ b/spacy/lang/sr/examples.py
@@ -0,0 +1,25 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.sr.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    # Translations from English
+    "Apple планира куповину америчког стартапа за $1 милијарду.",
+    "Беспилотни аутомобили пребацују одговорност осигурања на произвођаче.",
+    "Лондон је велики град у Уједињеном Краљевству.",
+    "Где си ти?",
+    "Ко је председник Француске?",
+    # Serbian common and slang
+    "Moj ћале је инжењер!",
+    "Новак Ђоковић је најбољи тенисер света.",
+    "У Пироту има добрих кафана!",
+    "Музеј Николе Тесле се налази у Београду."
+]
diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py
new file mode 100644
index 000000000..8909b7958
--- /dev/null
+++ b/spacy/lang/sr/lex_attrs.py
@@ -0,0 +1,69 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+_num_words = [
+    "нула",
+    "један",
+    "два",
+    "три",
+    "четири",
+    "пет",
+    "шест",
+    "седам",
+    "осам",
+    "девет",
+    "десет",
+    "једанаест",
+    "дванаест",
+    "тринаест",
+    "четрнаест",
+    "петнаест",
+    "шеснаест",
+    "седамнаест",
+    "осамнаест",
+    "деветнаест",
+    "двадесет",
+    "тридесет",
+    "четрдесет",
+    "педесет",
+    "шездесет",
+    "седамдесет",
+    "осамдесет",
+    "деведесет",
+    "сто",
+    "двеста",
+    "триста",
+    "четиристо",
+    "петсто",
+    "шестсто",
+    "седамсто",
+    "осамсто",
+    "деветсто",
+    "хиљаду",
+    "милион",
+    "милијарду",
+    "трилион",
+    "квадрилион",
+    "квинтилион"
+]
+
+
+def like_num(text):
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text.lower() in _num_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
diff --git a/spacy/lang/sr/norm_exceptions.py b/spacy/lang/sr/norm_exceptions.py
new file mode 100644
index 000000000..25db66532
--- /dev/null
+++ b/spacy/lang/sr/norm_exceptions.py
@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+_exc = {
+    # Slang
+    "ћале": "отац",
+    "кева": "мајка",
+    "смор": "досада",
+    "кец": "јединица",
+    "тебра": "брат",
+    "штребер": "ученик",
+    "факс": "факултет",
+    "профа": "професор",
+    "бус": "аутобус",
+    "пискарало": "службеник",
+    "бакутанер": "бака",
+    "џибер": "простак"
+}
+
+
+NORM_EXCEPTIONS = {}
+
+for string, norm in _exc.items():
+    NORM_EXCEPTIONS[string] = norm
+    NORM_EXCEPTIONS[string.title()] = norm
diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py
new file mode 100755
index 000000000..8fdfd91b8
--- /dev/null
+++ b/spacy/lang/sr/tokenizer_exceptions.py
@@ -0,0 +1,95 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import ORTH, LEMMA, NORM
+
+
+_exc = {}
+
+_abbrev_exc = [
+    # Weekdays abbreviations
+    {ORTH: "пoн", LEMMA: "понедељак", NORM: "понедељак"},
+    {ORTH: "уто", LEMMA: "уторак", NORM: "уторак"},
+    {ORTH: "сре", LEMMA: "среда", NORM: "среда"},
+    {ORTH: "чет", LEMMA: "четвртак", NORM: "четвртак"},
+    {ORTH: "пет", LEMMA: "петак", NORM: "петак"},
+    {ORTH: "суб", LEMMA: "субота", NORM: "субота"},
+    {ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
+
+    # Months abbreviations
+    {ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
+    {ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
+    {ORTH: "мар", LEMMA: "март", NORM: "март"},
+    {ORTH: "апр", LEMMA: "април", NORM: "април"},
+    {ORTH: "јуни", LEMMA: "јун", NORM: "јун"},
+    {ORTH: "јули", LEMMA: "јул", NORM: "јул"},
+    {ORTH: "авг", LEMMA: "август", NORM: "август"},
+    {ORTH: "сеп", LEMMA: "септембар", NORM: "септембар"},
+    {ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
+    {ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
+    {ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
+    {ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}
+]
+
+
+for abbrev_desc in _abbrev_exc:
+    abbrev = abbrev_desc[ORTH]
+    for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
+        _exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
+        _exc[orth + '.'] = [{ORTH: orth + '.', LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
+
+
+# common abbreviations
+_slang_exc = [
+    # without dot
+    {ORTH: 'др', LEMMA: 'доктор', NORM: 'доктор'},
+    {ORTH: 'гдин', LEMMA: 'господин', NORM: 'господин'},
+    {ORTH: 'гђа', LEMMA: 'госпођа', NORM: 'госпођа'},
+    {ORTH: 'гђица', LEMMA: 'госпођица', NORM: 'госпођица'},
+    {ORTH: 'мр', LEMMA: 'магистар', NORM: 'магистар'},
+    {ORTH: 'Бгд', LEMMA: 'Београд', NORM: 'београд'},
+    {ORTH: 'цм', LEMMA: 'центиметар', NORM: 'центиметар'},
+    {ORTH: 'м', LEMMA: 'метар', NORM: 'метар'},
+    {ORTH: 'км', LEMMA: 'километар', NORM: 'километар'},
+    {ORTH: 'мг', LEMMA: 'милиграм', NORM: 'милиграм'},
+    {ORTH: 'кг', LEMMA: 'килограм', NORM: 'килограм'},
+    {ORTH: 'дл', LEMMA: 'децилитар', NORM: 'децилитар'},
+    {ORTH: 'хл', LEMMA: 'хектолитар', NORM: 'хектолитар'},
+    # with dot
+    {ORTH: 'ул.', LEMMA: 'улица', NORM: 'улица'},
+    {ORTH: 'бр.', LEMMA: 'број', NORM: 'број'},
+    {ORTH: 'нпр.', LEMMA: 'на пример', NORM: 'на пример'},
+    {ORTH: 'тзв.', LEMMA: 'такозван', NORM: 'такозван'},
+    {ORTH: 'проф.', LEMMA: 'професор', NORM: 'професор'},
+    {ORTH: 'стр.', LEMMA: 'страна', NORM: 'страна'},
+    {ORTH: 'једн.', LEMMA: 'једнина', NORM: 'једнина'},
+    {ORTH: 'мн.', LEMMA: 'множина', NORM: 'множина'},
+    {ORTH: 'уч.', LEMMA: 'ученик', NORM: 'ученик'},
+    {ORTH: 'разр.', LEMMA: 'разред', NORM: 'разред'},
+    {ORTH: 'инж.', LEMMA: 'инжењер', NORM: 'инжењер'},
+    {ORTH: 'гимн.', LEMMA: 'гимназија', NORM: 'гимназија'},
+    {ORTH: 'год.', LEMMA: 'година', NORM: 'година'},
+    {ORTH: 'мед.', LEMMA: 'медицина', NORM: 'медицина'},
+    {ORTH: 'гимн.', LEMMA: 'гимназија', NORM: 'гимназија'},
+    {ORTH: "акад.", LEMMA: "академик", NORM: "академик"},
+    {ORTH: "доц.", LEMMA: "доцент", NORM: "доцент"},
+    {ORTH: "итд.", LEMMA: "и тако даље", NORM: "и тако даље"},
+    {ORTH: "и сл.", LEMMA: "и слично", NORM: "и слично"},
+    {ORTH: "н.е.", LEMMA: "нова ера", NORM: "нове ере"},
+    {ORTH: "о.г.", LEMMA: "ова година", NORM: "ове године"},
+    {ORTH: "л.к.", LEMMA: "лична карта", NORM: "лична карта"},
+    {ORTH: "в.д.", LEMMA: "вршилац дужности", NORM: "вршилац дужности"},
+    {ORTH: "стр.", LEMMA: "страна", NORM: "страна"},
+    # with quote
+    {ORTH: "ал'", LEMMA: "али", NORM: "али"},
+    {ORTH: "ил'", LEMMA: "или", NORM: "или"},
+    {ORTH: "је л'", LEMMA: "је ли", NORM: "је ли"},
+    {ORTH: "да л'", LEMMA: "да ли", NORM: "да ли"},
+    {ORTH: "држ'те", LEMMA: "држати", NORM: "држите"}
+]
+
+for slang_desc in _slang_exc:
+    _exc[slang_desc[ORTH]] = [slang_desc]
+
+
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index fdd86616d..eeb2b2d6f 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -176,6 +176,11 @@ def ru_tokenizer():
     return get_lang_class("ru").Defaults.create_tokenizer()
 
 
+@pytest.fixture(scope="session")
+def sr_tokenizer():
+    return get_lang_class("sr").Defaults.create_tokenizer()
+
+
 @pytest.fixture(scope="session")
 def sv_tokenizer():
     return get_lang_class("sv").Defaults.create_tokenizer()
diff --git a/spacy/tests/lang/sr/__init__.py b/spacy/tests/lang/sr/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/sr/test_tokenizer.py b/spacy/tests/lang/sr/test_tokenizer.py
new file mode 100644
index 000000000..c4672b3ef
--- /dev/null
+++ b/spacy/tests/lang/sr/test_tokenizer.py
@@ -0,0 +1,128 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+PUNCT_OPEN = ["(", "[", "{", "*"]
+PUNCT_CLOSE = [")", "]", "}", "*"]
+PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]
+
+
+@pytest.mark.parametrize("text", ["(", "((", "<"])
+def test_sr_tokenizer_handles_only_punct(sr_tokenizer, text):
+    tokens = sr_tokenizer(text)
+    assert len(tokens) == len(text)
+
+
+@pytest.mark.parametrize("punct", PUNCT_OPEN)
+@pytest.mark.parametrize("text", ["Здраво"])
+def test_sr_tokenizer_splits_open_punct(sr_tokenizer, punct, text):
+    tokens = sr_tokenizer(punct + text)
+    assert len(tokens) == 2
+    assert tokens[0].text == punct
+    assert tokens[1].text == text
+
+
+@pytest.mark.parametrize("punct", PUNCT_CLOSE)
+@pytest.mark.parametrize("text", ["Здраво"])
+def test_sr_tokenizer_splits_close_punct(sr_tokenizer, punct, text):
+    tokens = sr_tokenizer(text + punct)
+    assert len(tokens) == 2
+    assert tokens[0].text == text
+    assert tokens[1].text == punct
+
+
+@pytest.mark.parametrize("punct", PUNCT_OPEN)
+@pytest.mark.parametrize("punct_add", ["`"])
+@pytest.mark.parametrize("text", ["Ћао"])
+def test_sr_tokenizer_splits_two_diff_open_punct(sr_tokenizer, punct, punct_add, text):
+    tokens = sr_tokenizer(punct + punct_add + text)
+    assert len(tokens) == 3
+    assert tokens[0].text == punct
+    assert tokens[1].text == punct_add
+    assert tokens[2].text == text
+
+
+@pytest.mark.parametrize("punct", PUNCT_CLOSE)
+@pytest.mark.parametrize("punct_add", ["'"])
+@pytest.mark.parametrize("text", ["Здраво"])
+def test_sr_tokenizer_splits_two_diff_close_punct(sr_tokenizer, punct, punct_add, text):
+    tokens = sr_tokenizer(text + punct + punct_add)
+    assert len(tokens) == 3
+    assert tokens[0].text == text
+    assert tokens[1].text == punct
+    assert tokens[2].text == punct_add
+
+
+@pytest.mark.parametrize("punct", PUNCT_OPEN)
+@pytest.mark.parametrize("text", ["Здраво"])
+def test_sr_tokenizer_splits_same_open_punct(sr_tokenizer, punct, text):
+    tokens = sr_tokenizer(punct + punct + punct + text)
+    assert len(tokens) == 4
+    assert tokens[0].text == punct
+    assert tokens[3].text == text
+
+
+@pytest.mark.parametrize("punct", PUNCT_CLOSE)
+@pytest.mark.parametrize("text", ["Здраво"])
+def test_sr_tokenizer_splits_same_close_punct(sr_tokenizer, punct, text):
+    tokens = sr_tokenizer(text + punct + punct + punct)
+    assert len(tokens) == 4
+    assert tokens[0].text == text
+    assert tokens[1].text == punct
+
+
+@pytest.mark.parametrize("text", ["'Тест"])
+def test_sr_tokenizer_splits_open_appostrophe(sr_tokenizer, text):
+    tokens = sr_tokenizer(text)
+    assert len(tokens) == 2
+    assert tokens[0].text == "'"
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("text", ["Тест''"])
+def test_sr_tokenizer_splits_double_end_quote(sr_tokenizer, text):
+    tokens = sr_tokenizer(text)
+    assert len(tokens) == 2
+    tokens_punct = sr_tokenizer("''")
+    assert len(tokens_punct) == 1
+
+
+@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
+@pytest.mark.parametrize("text", ["Тест"])
+def test_sr_tokenizer_splits_open_close_punct(
+    sr_tokenizer, punct_open, punct_close, text
+):
+    tokens = sr_tokenizer(punct_open + text + punct_close)
+    assert len(tokens) == 3
+    assert tokens[0].text == punct_open
+    assert tokens[1].text == text
+    assert tokens[2].text == punct_close
+
+
+@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
+@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
+@pytest.mark.parametrize("text", ["Тест"])
+def test_sr_tokenizer_two_diff_punct(
+    sr_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
+):
+    tokens = sr_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
+    assert len(tokens) == 5
+    assert tokens[0].text == punct_open2
+    assert tokens[1].text == punct_open
+    assert tokens[2].text == text
+    assert tokens[3].text == punct_close
+    assert tokens[4].text == punct_close2
+
+
+@pytest.mark.parametrize("text", ["Тест."])
+def test_sr_tokenizer_splits_trailing_dot(sr_tokenizer, text):
+    tokens = sr_tokenizer(text)
+    assert tokens[1].text == "."
+
+
+def test_sr_tokenizer_splits_bracket_period(sr_tokenizer):
+    text = "(Један, два, три, четири, проба)."
+    tokens = sr_tokenizer(text)
+    assert tokens[len(tokens) - 1].text == "."
diff --git a/spacy/tests/lang/sr/test_еxceptions.py b/spacy/tests/lang/sr/test_еxceptions.py
new file mode 100644
index 000000000..136c995ab
--- /dev/null
+++ b/spacy/tests/lang/sr/test_еxceptions.py
@@ -0,0 +1,14 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,norms,lemmas",
+    [("о.г.", ["ове године"], ["ова година"]), ("чет.", ["четвртак"], ["четвртак"]),
+     ("гђа", ["госпођа"], ["госпођа"]), ("ил'", ["или"], ["или"])])
+def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
+    tokens = sr_tokenizer(text)
+    assert len(tokens) == 1
+    assert [token.norm_ for token in tokens] == norms
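
Not part of the patch, just a quick sanity check: a minimal usage sketch of the new Serbian
language data, assuming spaCy v2.x with this patch applied. The sample strings are made up
for illustration; the calls mirror the sr_tokenizer fixture added to conftest.py above.

    from spacy.lang.sr import Serbian

    nlp = Serbian()

    # Dotted abbreviations from TOKENIZER_EXCEPTIONS stay single tokens.
    doc = nlp("То је тзв. добра кафана, нпр. она у Пироту.")
    assert "тзв." in [t.text for t in doc]
    assert "нпр." in [t.text for t in doc]

    # NORM_EXCEPTIONS map slang to the standard form via token.norm_.
    assert nlp("ћале")[0].norm_ == "отац"

    # LEX_ATTRS wires up like_num for Cyrillic number words.
    assert all(t.like_num for t in nlp("двадесет пет"))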