diff --git a/spacy/lang/dsb/__init__.py b/spacy/lang/dsb/__init__.py
new file mode 100644
index 000000000..c66092a0c
--- /dev/null
+++ b/spacy/lang/dsb/__init__.py
@@ -0,0 +1,16 @@
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
+from ...language import Language, BaseDefaults
+
+
+class LowerSorbianDefaults(BaseDefaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+
+
+class LowerSorbian(Language):
+    lang = "dsb"
+    Defaults = LowerSorbianDefaults
+
+
+__all__ = ["LowerSorbian"]
diff --git a/spacy/lang/dsb/examples.py b/spacy/lang/dsb/examples.py
new file mode 100644
index 000000000..28b8c41f1
--- /dev/null
+++ b/spacy/lang/dsb/examples.py
@@ -0,0 +1,15 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.dsb.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
+    "Mi so tu jara derje spodoba.",
+    "Kotre nowniny chceće měć?",
+    "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.",
+    "Zwóstanjo pótakem hyšći wjele źěła."
+]
diff --git a/spacy/lang/dsb/lex_attrs.py b/spacy/lang/dsb/lex_attrs.py
new file mode 100644
index 000000000..75fb2e590
--- /dev/null
+++ b/spacy/lang/dsb/lex_attrs.py
@@ -0,0 +1,85 @@
+from ...attrs import LIKE_NUM
+
+_num_words = [
+    "nul",
+    "jaden", "jadna", "jadno",
+    "dwa", "dwě",
+    "tśi", "tśo",
+    "styri", "styrjo",
+    "pěś", "pěśo",
+    "šesć", "šesćo",
+    "sedym", "sedymjo",
+    "wósym", "wósymjo",
+    "źewjeś", "źewjeśo",
+    "źaseś", "źaseśo",
+    # NOTE(review): the two "-nassćo" spellings below differ from the "-nasćo"
+    # pattern used by the other teens in this list; confirm with a dictionary.
+    "jadnassćo",
+    "dwanassćo",
+    "tśinasćo",
+    "styrnasćo",
+    "pěśnasćo",
+    "šesnasćo",
+    "sedymnasćo",
+    "wósymnasćo",
+    "źewjeśnasćo",
+    "dwanasćo", "dwaźasća",
+    "tśiźasća",
+    "styrźasća",
+    "pěśźaset",
+    "šesćźaset",
+    "sedymźaset",
+    "wósymźaset",
+    "źewjeśźaset",
+    "sto",
+    "tysac",
+    "milion",
+    "miliarda",
+    "bilion",
+    "biliarda",
+    "trilion",
+    "triliarda",
+]
+
+_ordinal_words = [
+    "prědny", "prědna", "prědne",
+    "drugi", "druga", "druge",
+    "tśeśi", "tśeśa", "tśeśe",
+    "stwórty", "stwórta", "stwórte",
+    "pěty", "pěta", "pěte",
+    "šesty", "šesta", "šeste",
+    "sedymy", "sedyma", "sedyme",
+    "wósymy", "wósyma", "wósyme",
+    "źewjety", "źewjeta", "źewjete",
+    "źasety", "źaseta", "źasete",
+    "jadnasty", "jadnasta", "jadnaste",
+    "dwanasty", "dwanasta", "dwanaste",
+]
+
+
+def like_num(text):
+    """Return True if `text` looks like a number, otherwise False.
+
+    One leading "+", "-", "±" or "~" is dropped and every "," and "." is
+    removed. The rest matches if it is all digits, a digit/digit fraction
+    such as "1/2", or (lowercased) is in `_num_words` or `_ordinal_words`.
+    """
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    text_lower = text.lower()
+    if text_lower in _num_words:
+        return True
+    # Check ordinal number
+    if text_lower in _ordinal_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
diff --git a/spacy/lang/dsb/stop_words.py b/spacy/lang/dsb/stop_words.py
new file mode 100644
index 000000000..376e04aa6
--- /dev/null
+++ b/spacy/lang/dsb/stop_words.py
@@ -0,0 +1,15 @@
+STOP_WORDS = set(
+    """
+a abo aby ako ale až
+
+daniž dokulaž
+
+gaž
+
+jolic
+
+pak pótom
+
+teke togodla
+""".split()
+)
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 7083fd817..24474c71e 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -99,6 +99,11 @@ def de_vocab():
     return get_lang_class("de")().vocab
 
 
+@pytest.fixture(scope="session")
+def dsb_tokenizer():
+    return get_lang_class("dsb")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def el_tokenizer():
     return get_lang_class("el")().tokenizer
diff --git a/spacy/tests/lang/dsb/__init__.py b/spacy/tests/lang/dsb/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/dsb/test_text.py b/spacy/tests/lang/dsb/test_text.py
new file mode 100644
index 000000000..40f2c15e0
--- /dev/null
+++ b/spacy/tests/lang/dsb/test_text.py
@@ -0,0 +1,26 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("10,000", True),
+        ("10,00", True),
+        ("jadno", True),
+        ("dwanassćo", True),
+        ("milion", True),
+        ("sto", True),
+        ("pěty", True),
+        ("ceła", False),
+        ("kopica", False),
+        ("narěcow", False),
+        (",", False),
+        ("1/2", True),
+    ],
+)
+def test_lex_attrs_like_number(dsb_tokenizer, text, match):
+    tokens = dsb_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
diff --git a/spacy/tests/lang/dsb/test_tokenizer.py b/spacy/tests/lang/dsb/test_tokenizer.py
new file mode 100644
index 000000000..135974fb8
--- /dev/null
+++ b/spacy/tests/lang/dsb/test_tokenizer.py
@@ -0,0 +1,29 @@
+import pytest
+
+DSB_BASIC_TOKENIZATION_TESTS = [
+    (
+        "Ale eksistěrujo mimo togo ceła kopica narěcow, ako na pśikład slěpjańska.",
+        [
+            "Ale",
+            "eksistěrujo",
+            "mimo",
+            "togo",
+            "ceła",
+            "kopica",
+            "narěcow",
+            ",",
+            "ako",
+            "na",
+            "pśikład",
+            "slěpjańska",
+            ".",
+        ],
+    ),
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS)
+def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens):
+    tokens = dsb_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list