diff --git a/spacy/lang/hsb/__init__.py b/spacy/lang/hsb/__init__.py
new file mode 100644
index 000000000..034d82319
--- /dev/null
+++ b/spacy/lang/hsb/__init__.py
@@ -0,0 +1,18 @@
+from .lex_attrs import LEX_ATTRS
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ...language import Language, BaseDefaults
+
+
+class UpperSorbianDefaults(BaseDefaults):
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+
+
+class UpperSorbian(Language):
+    lang = "hsb"
+    Defaults = UpperSorbianDefaults
+
+
+__all__ = ["UpperSorbian"]
diff --git a/spacy/lang/hsb/examples.py b/spacy/lang/hsb/examples.py
new file mode 100644
index 000000000..0aafd5cee
--- /dev/null
+++ b/spacy/lang/hsb/examples.py
@@ -0,0 +1,15 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.hsb.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin",
+    "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.",
+    "A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!",
+    "Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.",
+    "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej."
+]
diff --git a/spacy/lang/hsb/lex_attrs.py b/spacy/lang/hsb/lex_attrs.py
new file mode 100644
index 000000000..dfda3e2db
--- /dev/null
+++ b/spacy/lang/hsb/lex_attrs.py
@@ -0,0 +1,77 @@
+from ...attrs import LIKE_NUM
+
+_num_words = [
+    "nul",
+    "jedyn", "jedna", "jedne",
+    "dwaj", "dwě",
+    "tři", "třo",
+    "štyri", "štyrjo",
+    "pjeć",
+    "šěsć",
+    "sydom",
+    "wosom",
+    "dźewjeć",
+    "dźesać",
+    "jědnaće",
+    "dwanaće",
+    "třinaće",
+    "štyrnaće",
+    "pjatnaće",
+    "šěsnaće",
+    "sydomnaće",
+    "wosomnaće",
+    "dźewjatnaće",
+    "dwaceći",
+    "třiceći",
+    "štyrceći",
+    "pjećdźesat",
+    "šěsćdźesat",
+    "sydomdźesat",
+    "wosomdźesat",
+    "dźewjećdźesat",
+    "sto",
+    "tysac",
+    "milion",
+    "miliarda",
+    "bilion",
+    "biliarda",
+    "trilion",
+    "triliarda",
+]
+
+_ordinal_words = [
+    "prěni", "prěnja", "prěnje",
+    "druhi", "druha", "druhe",
+    "třeći", "třeća", "třeće",
+    "štwórty", "štwórta", "štwórte",
+    "pjaty", "pjata", "pjate",
+    "šěsty", "šěsta", "šěste",
+    "sydmy", "sydma", "sydme",
+    "wosmy", "wosma", "wosme",
+    "dźewjaty", "dźewjata", "dźewjate",
+    "dźesaty", "dźesata", "dźesate",
+    "jědnaty", "jědnata", "jědnate",
+    "dwanaty", "dwanata", "dwanate"
+]
+
+
+def like_num(text):
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    text_lower = text.lower()
+    if text_lower in _num_words:
+        return True
+    # Check ordinal number
+    if text_lower in _ordinal_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
diff --git a/spacy/lang/hsb/stop_words.py b/spacy/lang/hsb/stop_words.py
new file mode 100644
index 000000000..e6fedaf4c
--- /dev/null
+++ b/spacy/lang/hsb/stop_words.py
@@ -0,0 +1,19 @@
+STOP_WORDS = set(
+    """
+a abo ale ani
+
+dokelž
+
+hdyž
+
+jeli jelizo
+
+kaž
+
+pak potom
+
+tež tohodla
+
+zo zoby
+""".split()
+)
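Note: the files above are already enough to drive the LIKE_NUM lexical attribute through a blank pipeline. Below is a minimal sanity check, assuming a spaCy checkout with this patch applied; the snippet itself is not part of the diff, and the probe tokens are borrowed from the test cases further down.

    # Not part of the diff: quick check of the hsb like_num attribute.
    # Assumes a spaCy installation that includes the files added above.
    from spacy.lang.hsb import UpperSorbian

    nlp = UpperSorbian()  # blank pipeline: tokenizer plus lexical attributes
    for text in ["dwanaće", "sto", "10,000", "1/2", "wona"]:
        doc = nlp(text)
        print(text, doc[0].like_num)
    # Prints True for the first four probes and False for "wona"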
diff --git a/spacy/lang/hsb/tokenizer_exceptions.py b/spacy/lang/hsb/tokenizer_exceptions.py
new file mode 100644
index 000000000..4b9a4f98a
--- /dev/null
+++ b/spacy/lang/hsb/tokenizer_exceptions.py
@@ -0,0 +1,18 @@
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...symbols import ORTH, NORM
+from ...util import update_exc
+
+_exc = dict()
+for exc_data in [
+    {ORTH: "mil.", NORM: "milion"},
+    {ORTH: "wob.", NORM: "wobydler"},
+]:
+    _exc[exc_data[ORTH]] = [exc_data]
+
+for orth in [
+    "resp.",
+]:
+    _exc[orth] = [{ORTH: orth}]
+
+
+TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index f9266cb94..7083fd817 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -221,6 +221,11 @@ def ja_tokenizer():
     return get_lang_class("ja")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def hsb_tokenizer():
+    return get_lang_class("hsb")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def ko_tokenizer():
     pytest.importorskip("natto")
diff --git a/spacy/tests/lang/hsb/__init__.py b/spacy/tests/lang/hsb/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/hsb/test_text.py b/spacy/tests/lang/hsb/test_text.py
new file mode 100644
index 000000000..aaa4984eb
--- /dev/null
+++ b/spacy/tests/lang/hsb/test_text.py
@@ -0,0 +1,25 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("10,000", True),
+        ("10,00", True),
+        ("jedne", True),
+        ("dwanaće", True),
+        ("milion", True),
+        ("sto", True),
+        ("załožene", False),
+        ("wona", False),
+        ("powšitkownej", False),
+        (",", False),
+        ("1/2", True),
+    ],
+)
+def test_lex_attrs_like_number(hsb_tokenizer, text, match):
+    tokens = hsb_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
diff --git a/spacy/tests/lang/hsb/test_tokenizer.py b/spacy/tests/lang/hsb/test_tokenizer.py
new file mode 100644
index 000000000..a3ec89ba0
--- /dev/null
+++ b/spacy/tests/lang/hsb/test_tokenizer.py
@@ -0,0 +1,32 @@
+import pytest
+
+HSB_BASIC_TOKENIZATION_TESTS = [
+    (
+        "Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow, kotrež so zdźěla chětro wot so rozeznawachu.",
+        [
+            "Hornjoserbšćina",
+            "wobsteji",
+            "resp.",
+            "wobsteješe",
+            "z",
+            "wjacorych",
+            "dialektow",
+            ",",
+            "kotrež",
+            "so",
+            "zdźěla",
+            "chětro",
+            "wot",
+            "so",
+            "rozeznawachu",
+            ".",
+        ],
+    ),
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", HSB_BASIC_TOKENIZATION_TESTS)
+def test_hsb_tokenizer_basic(hsb_tokenizer, text, expected_tokens):
+    tokens = hsb_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list
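As an end-to-end check of the "resp." tokenizer exception, the test sentence can be run through a blank hsb pipeline. A sketch, again assuming the patch is installed; it mirrors test_hsb_tokenizer_basic above and is not part of the diff.

    # Not part of the diff: end-to-end check of the tokenizer exceptions.
    import spacy

    nlp = spacy.blank("hsb")  # resolves to the UpperSorbian class added above
    doc = nlp("Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow.")
    print([t.text for t in doc])
    # "resp." survives as a single token instead of having its period split off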