Serbian language improvement (#4169)

* Serbian stop words added (Cyrillic alphabet).

* spaCy contributor agreement included.

* Test initialization updated.

* Serbian language code updated (bugfix).

* Tokenizer exceptions added. Init file updated.

* Norm exceptions and lexical attributes added.

* Examples added.

* Tests added.

* sr language examples updated.

* Tokenizer exceptions updated (Serbian).
Pavle Vidanović 2019-08-22 11:43:07 +02:00 committed by Ines Montani
parent de272f8b82
commit 60e10a9f93
9 changed files with 368 additions and 3 deletions
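For a quick sense of what the PR adds end to end, a rough usage sketch (not part of the diff; it assumes spaCy 2.x, where spacy.blank("sr") resolves to the Serbian defaults updated below, and the commented outcomes are approximate):

# Rough sketch: exercise the new Serbian ("sr") language data (assumes spaCy 2.x).
import spacy

nlp = spacy.blank("sr")  # tokenizer and language defaults only, no trained components
doc = nlp("Идем на факс, нпр. аутобусом, а ћале ме чека.")
print([t.text for t in doc])     # "нпр." should stay a single token (tokenizer exception)
print([t.norm_ for t in doc])    # "ћале" should normalise to "отац" (norm exception)
print([t.is_stop for t in doc])  # flags driven by the Serbian stop-word list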

View File

@@ -2,7 +2,9 @@
 from __future__ import unicode_literals
 from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
 from ...language import Language
@@ -12,11 +14,12 @@ from ...util import update_exc, add_lookups
 class SerbianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "sr"
     lex_attr_getters[NORM] = add_lookups(
-        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
+        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS, NORM_EXCEPTIONS
     )
-    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
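The hunk above plugs the new lexical attributes, norm exceptions and tokenizer exceptions into SerbianDefaults. A hedged sketch of exercising those defaults directly, mirroring the sr_tokenizer fixture added to conftest.py further down (the sample sentence and the expected behaviour are illustrative only):

from spacy.util import get_lang_class

sr_cls = get_lang_class("sr")                   # the Serbian Language subclass
tokenizer = sr_cls.Defaults.create_tokenizer()  # BASE_EXCEPTIONS merged with the new TOKENIZER_EXCEPTIONS
tokens = tokenizer("Видимо се у сре. или у нед.")
print([t.text for t in tokens])   # "сре." and "нед." should remain single tokens
print([t.norm_ for t in tokens])  # with norms "среда" and "недеља" from the exception entries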

spacy/lang/sr/examples.py Normal file (25 lines added)
View File

@@ -0,0 +1,25 @@
# coding: utf8
from __future__ import unicode_literals


"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.sr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    # Translations from English
    "Apple планира куповину америчког стартапа за $1 милијарду.",
    "Беспилотни аутомобили пребацују одговорност осигурања на произвођаче.",
    "Лондон је велики град у Уједињеном Краљевству.",
    "Где си ти?",
    "Ко је председник Француске?",
    # Serbian common and slang
    "Moj ћале је инжењер!",
    "Новак Ђоковић је најбољи тенисер света.",
    "У Пироту има добрих кафана!",
    "Музеј Николе Тесле се налази у Београду.",
]
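The docstring above suggests piping these sentences through an nlp object; a minimal sketch, assuming a blank "sr" pipeline with no trained components:

import spacy
from spacy.lang.sr.examples import sentences

nlp = spacy.blank("sr")
for doc in nlp.pipe(sentences):
    # without trained pipeline components this only tokenizes, which is enough to eyeball the data
    print(len(doc), [t.text for t in doc][:5])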

View File

@@ -0,0 +1,69 @@
# coding: utf8
from __future__ import unicode_literals

from ...attrs import LIKE_NUM


_num_words = [
"нула",
"један",
"два",
"три",
"четири",
"пет",
"шест",
"седам",
"осам",
"девет",
"десет",
"једанаест",
"дванаест",
"тринаест",
"четрнаест",
"петнаест",
"шеснаест",
"седамнаест",
"осамнаест",
"деветнаест",
"двадесет",
"тридесет",
"четрдесет",
"педесет",
"шездесет",
"седамдесет",
"осамдесет",
"деведесет",
"сто",
"двеста",
"триста",
"четиристо",
"петсто",
"шестсто",
"седамсто",
"осамсто",
"деветсто",
"хиљаду",
"милион",
"милијарду",
"трилион",
"квадрилион",
"квинтилион"
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text.lower() in _num_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}
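Since like_num is a pure helper, it can be sanity-checked in isolation; the assertions below simply restate the logic above (a sketch, not part of the committed tests):

from spacy.lang.sr.lex_attrs import like_num

assert like_num("10")
assert like_num("10.000")      # separators are stripped before isdigit()
assert like_num("-3/4")        # leading signs and simple fractions are accepted
assert like_num("седамнаест")  # Cyrillic number words from _num_words
assert not like_num("кафана")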

View File

@@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals


_exc = {
# Slang
"ћале": "отац",
"кева": "мајка",
"смор": "досада",
"кец": "јединица",
"тебра": "брат",
"штребер": "ученик",
"факс": "факултет",
"профа": "професор",
"бус": "аутобус",
"пискарало": "службеник",
"бакутанер": "бака",
"џибер": "простак"
}

NORM_EXCEPTIONS = {}

for string, norm in _exc.items():
    NORM_EXCEPTIONS[string] = norm
    NORM_EXCEPTIONS[string.title()] = norm
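Once SerbianDefaults (first hunk) chains NORM_EXCEPTIONS into the NORM getter, these slang norms should surface on token.norm_; a rough sketch assuming a blank "sr" pipeline:

import spacy

nlp = spacy.blank("sr")
doc = nlp("Мој ћале и кева иду на факс.")
print([(t.text, t.norm_) for t in doc])
# expected, roughly: ("ћале", "отац"), ("кева", "мајка"), ("факс", "факултет")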

View File

@@ -0,0 +1,95 @@
# encoding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH, LEMMA, NORM


_exc = {}

_abbrev_exc = [
# Weekdays abbreviations
{ORTH: "пoн", LEMMA: "понедељак", NORM: "понедељак"},
{ORTH: "уто", LEMMA: "уторак", NORM: "уторак"},
{ORTH: "сре", LEMMA: "среда", NORM: "среда"},
{ORTH: "чет", LEMMA: "четвртак", NORM: "четвртак"},
{ORTH: "пет", LEMMA: "петак", NORM: "петак"},
{ORTH: "суб", LEMMA: "субота", NORM: "субота"},
{ORTH: "нед", LEMMA: "недеља", NORM: "недеља"},
# Months abbreviations
{ORTH: "јан", LEMMA: "јануар", NORM: "јануар"},
{ORTH: "феб", LEMMA: "фебруар", NORM: "фебруар"},
{ORTH: "мар", LEMMA: "март", NORM: "март"},
{ORTH: "апр", LEMMA: "април", NORM: "април"},
{ORTH: "јуни", LEMMA: "јун", NORM: "јун"},
{ORTH: "јули", LEMMA: "јул", NORM: "јул"},
{ORTH: "авг", LEMMA: "август", NORM: "август"},
{ORTH: "сеп", LEMMA: "септембар", NORM: "септембар"},
{ORTH: "септ", LEMMA: "септембар", NORM: "септембар"},
{ORTH: "окт", LEMMA: "октобар", NORM: "октобар"},
{ORTH: "нов", LEMMA: "новембар", NORM: "новембар"},
{ORTH: "дец", LEMMA: "децембар", NORM: "децембар"}
]

for abbrev_desc in _abbrev_exc:
    abbrev = abbrev_desc[ORTH]
    for orth in (abbrev, abbrev.capitalize(), abbrev.upper()):
        _exc[orth] = [{ORTH: orth, LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]
        _exc[orth + '.'] = [{ORTH: orth + '.', LEMMA: abbrev_desc[LEMMA], NORM: abbrev_desc[NORM]}]


# common abbreviations
_slang_exc = [
# without dot
{ORTH: 'др', LEMMA: 'доктор', NORM: 'доктор'},
{ORTH: 'гдин', LEMMA: 'господин', NORM: 'господин'},
{ORTH: 'гђа', LEMMA: 'госпођа', NORM: 'госпођа'},
{ORTH: 'гђица', LEMMA: 'госпођица', NORM: 'госпођица'},
{ORTH: 'мр', LEMMA: 'магистар', NORM: 'магистар'},
{ORTH: 'Бгд', LEMMA: 'Београд', NORM: 'београд'},
{ORTH: 'цм', LEMMA: 'центиметар', NORM: 'центиметар'},
{ORTH: 'м', LEMMA: 'метар', NORM: 'метар'},
{ORTH: 'км', LEMMA: 'километар', NORM: 'километар'},
{ORTH: 'мг', LEMMA: 'милиграм', NORM: 'милиграм'},
{ORTH: 'кг', LEMMA: 'килограм', NORM: 'килограм'},
{ORTH: 'дл', LEMMA: 'децилитар', NORM: 'децилитар'},
{ORTH: 'хл', LEMMA: 'хектолитар', NORM: 'хектолитар'},
# with dot
{ORTH: 'ул.', LEMMA: 'улица', NORM: 'улица'},
{ORTH: 'бр.', LEMMA: 'број', NORM: 'број'},
{ORTH: 'нпр.', LEMMA: 'на пример', NORM: 'на пример'},
{ORTH: 'тзв.', LEMMA: 'такозван', NORM: 'такозван'},
{ORTH: 'проф.', LEMMA: 'професор', NORM: 'професор'},
{ORTH: 'стр.', LEMMA: 'страна', NORM: 'страна'},
{ORTH: 'једн.', LEMMA: 'једнина', NORM: 'једнина'},
{ORTH: 'мн.', LEMMA: 'множина', NORM: 'множина'},
{ORTH: 'уч.', LEMMA: 'ученик', NORM: 'ученик'},
{ORTH: 'разр.', LEMMA: 'разред', NORM: 'разред'},
{ORTH: 'инж.', LEMMA: 'инжењер', NORM: 'инжењер'},
{ORTH: 'гимн.', LEMMA: 'гимназија', NORM: 'гимназија'},
{ORTH: 'год.', LEMMA: 'година', NORM: 'година'},
{ORTH: 'мед.', LEMMA: 'медицина', NORM: 'медицина'},
{ORTH: "акад.", LEMMA: "академик", NORM: "академик"},
{ORTH: "доц.", LEMMA: "доцент", NORM: "доцент"},
{ORTH: "итд.", LEMMA: "и тако даље", NORM: "и тако даље"},
{ORTH: "и сл.", LEMMA: "и слично", NORM: "и слично"},
{ORTH: "н.е.", LEMMA: "нова ера", NORM: "нове ере"},
{ORTH: "о.г.", LEMMA: "ова година", NORM: "ове године"},
{ORTH: "л.к.", LEMMA: "лична карта", NORM: "лична карта"},
{ORTH: "в.д.", LEMMA: "вршилац дужности", NORM: "вршилац дужности"},
{ORTH: "стр.", LEMMA: "страна", NORM: "страна"},
    # with quote
{ORTH: "ал'", LEMMA: "али", NORM: "али"},
{ORTH: "ил'", LEMMA: "или", NORM: "или"},
{ORTH: "је л'", LEMMA: "је ли", NORM: "је ли"},
{ORTH: "да л'", LEMMA: "да ли", NORM: "да ли"},
{ORTH: "држ'те", LEMMA: "држати", NORM: "држите"}
]

for slang_desc in _slang_exc:
    _exc[slang_desc[ORTH]] = [slang_desc]


TOKENIZER_EXCEPTIONS = _exc
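A hedged sketch of what the abbreviation loop above generates for one weekday entry, plus one of the multi-word contractions; spacy.symbols.LEMMA is the integer key used inside the exception dicts:

from spacy.lang.sr.tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from spacy.symbols import LEMMA

# the loop emits lowercase, capitalised and uppercase variants, with and without a trailing dot
for key in ("нед", "Нед", "НЕД", "нед.", "Нед.", "НЕД."):
    assert TOKENIZER_EXCEPTIONS[key][0][LEMMA] == "недеља"

# multi-word contractions stay a single special case under their literal surface form
assert "да л'" in TOKENIZER_EXCEPTIONS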

View File

@@ -176,6 +176,11 @@ def ru_tokenizer():
     return get_lang_class("ru").Defaults.create_tokenizer()


+@pytest.fixture(scope="session")
+def sr_tokenizer():
+    return get_lang_class("sr").Defaults.create_tokenizer()
+
+
 @pytest.fixture(scope="session")
 def sv_tokenizer():
     return get_lang_class("sv").Defaults.create_tokenizer()

View File

View File

@@ -0,0 +1,128 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


PUNCT_OPEN = ["(", "[", "{", "*"]
PUNCT_CLOSE = [")", "]", "}", "*"]
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]


@pytest.mark.parametrize("text", ["(", "((", "<"])
def test_sr_tokenizer_handles_only_punct(sr_tokenizer, text):
    tokens = sr_tokenizer(text)
    assert len(tokens) == len(text)


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Здраво"])
def test_sr_tokenizer_splits_open_punct(sr_tokenizer, punct, text):
    tokens = sr_tokenizer(punct + text)
    assert len(tokens) == 2
    assert tokens[0].text == punct
    assert tokens[1].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Здраво"])
def test_sr_tokenizer_splits_close_punct(sr_tokenizer, punct, text):
    tokens = sr_tokenizer(text + punct)
    assert len(tokens) == 2
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize("text", ["Ћао"])
def test_sr_tokenizer_splits_two_diff_open_punct(sr_tokenizer, punct, punct_add, text):
    tokens = sr_tokenizer(punct + punct_add + text)
    assert len(tokens) == 3
    assert tokens[0].text == punct
    assert tokens[1].text == punct_add
    assert tokens[2].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("punct_add", ["'"])
@pytest.mark.parametrize("text", ["Здраво"])
def test_sr_tokenizer_splits_two_diff_close_punct(sr_tokenizer, punct, punct_add, text):
    tokens = sr_tokenizer(text + punct + punct_add)
    assert len(tokens) == 3
    assert tokens[0].text == text
    assert tokens[1].text == punct
    assert tokens[2].text == punct_add


@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Здраво"])
def test_sr_tokenizer_splits_same_open_punct(sr_tokenizer, punct, text):
    tokens = sr_tokenizer(punct + punct + punct + text)
    assert len(tokens) == 4
    assert tokens[0].text == punct
    assert tokens[3].text == text


@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Здраво"])
def test_sr_tokenizer_splits_same_close_punct(sr_tokenizer, punct, text):
    tokens = sr_tokenizer(text + punct + punct + punct)
    assert len(tokens) == 4
    assert tokens[0].text == text
    assert tokens[1].text == punct


@pytest.mark.parametrize("text", ["'Тест"])
def test_sr_tokenizer_splits_open_appostrophe(sr_tokenizer, text):
    tokens = sr_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == "'"


@pytest.mark.xfail
@pytest.mark.parametrize("text", ["Тест''"])
def test_sr_tokenizer_splits_double_end_quote(sr_tokenizer, text):
    tokens = sr_tokenizer(text)
    assert len(tokens) == 2
    tokens_punct = sr_tokenizer("''")
    assert len(tokens_punct) == 1


@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("text", ["Тест"])
def test_sr_tokenizer_splits_open_close_punct(
    sr_tokenizer, punct_open, punct_close, text
):
    tokens = sr_tokenizer(punct_open + text + punct_close)
    assert len(tokens) == 3
    assert tokens[0].text == punct_open
    assert tokens[1].text == text
    assert tokens[2].text == punct_close


@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
@pytest.mark.parametrize("text", ["Тест"])
def test_sr_tokenizer_two_diff_punct(
    sr_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
):
    tokens = sr_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
    assert len(tokens) == 5
    assert tokens[0].text == punct_open2
    assert tokens[1].text == punct_open
    assert tokens[2].text == text
    assert tokens[3].text == punct_close
    assert tokens[4].text == punct_close2


@pytest.mark.parametrize("text", ["Тест."])
def test_sr_tokenizer_splits_trailing_dot(sr_tokenizer, text):
    tokens = sr_tokenizer(text)
    assert tokens[1].text == "."


def test_sr_tokenizer_splits_bracket_period(sr_tokenizer):
    text = "(Један, два, три, четири, проба)."
    tokens = sr_tokenizer(text)
    assert tokens[len(tokens) - 1].text == "."

View File

@@ -0,0 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
    "text,norms,lemmas",
    [("о.г.", ["ове године"], ["ова година"]), ("чет.", ["четвртак"], ["четвртак"]),
     ("гђа", ["госпођа"], ["госпођа"]), ("ил'", ["или"], ["или"])])
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
    tokens = sr_tokenizer(text)
    assert len(tokens) == 1
    assert [token.norm_ for token in tokens] == norms