mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Add Lower Sorbian support. (#10431)
* Add basic support for Lower Sorbian. * Add some tests for dsb. * Update spacy/lang/dsb/examples.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									61ba5450ff
								
							
						
					
					
						commit
						5ca0dbae76
					
				
							
								
								
									
										16
									
								
								spacy/lang/dsb/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								spacy/lang/dsb/__init__.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,16 @@ | |||
"""Lower Sorbian (dsb) language module for spaCy.

Wires the dsb lexical attribute getters and stop words into a
``Language`` subclass registered under the code "dsb".
"""
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults


class LowerSorbianDefaults(BaseDefaults):
    """Language defaults for Lower Sorbian: only lexical attribute
    getters and stop words (no tokenizer exceptions or syntax data yet)."""

    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class LowerSorbian(Language):
    """Lower Sorbian language class (ISO 639-2 code "dsb")."""

    lang = "dsb"
    Defaults = LowerSorbianDefaults


__all__ = ["LowerSorbian"]
							
								
								
									
										15
									
								
								spacy/lang/dsb/examples.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								spacy/lang/dsb/examples.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,15 @@ | |||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.dsb.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.", | ||||
|     "Mi so tu jara derje spodoba.", | ||||
|     "Kotre nowniny chceće měć?", | ||||
|     "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.", | ||||
|     "Zwóstanjo pótakem hyšći wjele źěła." | ||||
| ] | ||||
							
								
								
									
										77
									
								
								spacy/lang/dsb/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										77
									
								
								spacy/lang/dsb/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,77 @@ | |||
| from ...attrs import LIKE_NUM | ||||
| 
 | ||||
# Cardinal number words of Lower Sorbian (several grammatical forms per
# numeral).  NOTE(review): "dwanasćo" below (next to "dwaźasća") looks like
# an alternate spelling of twelve ("dwanassćo" above) — confirm with a
# speaker; the data is kept verbatim here.
_num_words = [
    "nul",
    "jaden", "jadna", "jadno",
    "dwa", "dwě",
    "tśi", "tśo",
    "styri", "styrjo",
    "pěś", "pěśo",
    "šesć", "šesćo",
    "sedym", "sedymjo",
    "wósym", "wósymjo",
    "źewjeś", "źewjeśo",
    "źaseś", "źaseśo",
    "jadnassćo",
    "dwanassćo",
    "tśinasćo",
    "styrnasćo",
    "pěśnasćo",
    "šesnasćo",
    "sedymnasćo",
    "wósymnasćo",
    "źewjeśnasćo",
    "dwanasćo", "dwaźasća",
    "tśiźasća",
    "styrźasća",
    "pěśźaset",
    "šesćźaset",
    "sedymźaset",
    "wósymźaset",
    "źewjeśźaset",
    "sto",
    "tysac",
    "milion",
    "miliarda",
    "bilion",
    "biliarda",
    "trilion",
    "triliarda",
]

# Ordinal number words (masculine/feminine/neuter forms per numeral).
# NOTE(review): "pêty"/"pête" carry a circumflex (ê) where the rest of the
# data uses a haček (ě, as in "pěta") — likely a typo in the source data;
# confirm before changing the strings.
_ordinal_words = [
    "prědny", "prědna", "prědne",
    "drugi", "druga", "druge",
    "tśeśi", "tśeśa", "tśeśe",
    "stwórty", "stwórta", "stwórte",
    "pêty", "pěta", "pête",
    "šesty", "šesta", "šeste",
    "sedymy", "sedyma", "sedyme",
    "wósymy", "wósyma", "wósyme",
    "źewjety", "źewjeta", "źewjete",
    "źasety", "źaseta", "źasete",
    "jadnasty", "jadnasta", "jadnaste",
    "dwanasty", "dwanasta", "dwanaste",
]


def like_num(text):
    """Return True if *text* looks like a number in Lower Sorbian.

    Accepts plain digit strings (optionally with a single leading sign or
    approximation mark, and "," / "." as separators), simple fractions
    such as "1/2", and cardinal or ordinal number words from the lists
    above (matched case-insensitively).
    """
    # Drop at most one leading sign/approximation character.
    if text and text[0] in ("+", "-", "±", "~"):
        text = text[1:]
    # Strip group/decimal separators so "10,000" reads as a digit run.
    stripped = text.replace(",", "").replace(".", "")
    if stripped.isdigit():
        return True
    # Fractions: exactly one "/" with digits on both sides.
    parts = stripped.split("/")
    if len(parts) == 2 and all(part.isdigit() for part in parts):
        return True
    lowered = stripped.lower()
    return lowered in _num_words or lowered in _ordinal_words
| 
 | ||||
| 
 | ||||
# Attribute-ID -> getter mapping consumed by LowerSorbianDefaults.lex_attr_getters.
LEX_ATTRS = {LIKE_NUM: like_num}
							
								
								
									
										15
									
								
								spacy/lang/dsb/stop_words.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								spacy/lang/dsb/stop_words.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,15 @@ | |||
# Stop words for Lower Sorbian: conjunctions and other high-frequency
# function words.  Kept as a plain set literal, grouped roughly
# alphabetically as in the source data.
STOP_WORDS = {
    "a", "abo", "aby", "ako", "ale", "až",
    "daniž", "dokulaž",
    "gaž",
    "jolic",
    "pak", "pótom",
    "teke", "togodla",
}
|  | @ -99,6 +99,11 @@ def de_vocab(): | |||
|     return get_lang_class("de")().vocab | ||||
| 
 | ||||
| 
 | ||||
@pytest.fixture(scope="session")
def dsb_tokenizer():
    # Session-scoped so the Language object is built once and shared by
    # all Lower Sorbian tests.
    return get_lang_class("dsb")().tokenizer
| 
 | ||||
| 
 | ||||
@pytest.fixture(scope="session")
def el_tokenizer():
    # Session-scoped Greek tokenizer, shared by all "el" tests.
    return get_lang_class("el")().tokenizer
|  |  | |||
							
								
								
									
										0
									
								
								spacy/tests/lang/dsb/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								spacy/tests/lang/dsb/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										25
									
								
								spacy/tests/lang/dsb/test_text.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								spacy/tests/lang/dsb/test_text.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,25 @@ | |||
import pytest

# (input text, expected Token.like_num value).  Each input must come out
# of the tokenizer as exactly one token.
LIKE_NUM_CASES = [
    ("10", True),
    ("1", True),
    ("10,000", True),
    ("10,00", True),
    ("jadno", True),
    ("dwanassćo", True),
    ("milion", True),
    ("sto", True),
    ("ceła", False),
    ("kopica", False),
    ("narěcow", False),
    (",", False),
    ("1/2", True),
]


@pytest.mark.parametrize("text,match", LIKE_NUM_CASES)
def test_lex_attrs_like_number(dsb_tokenizer, text, match):
    """like_num flags digits, fractions and dsb number words — nothing else."""
    doc = dsb_tokenizer(text)
    assert len(doc) == 1
    assert doc[0].like_num == match
							
								
								
									
										29
									
								
								spacy/tests/lang/dsb/test_tokenizer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								spacy/tests/lang/dsb/test_tokenizer.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,29 @@ | |||
import pytest

# One (raw text, expected non-space tokens) pair exercising basic word
# and punctuation splitting for Lower Sorbian.
DSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Ale eksistěrujo mimo togo ceła kopica narěcow, ako na pśikład slěpjańska.",
        [
            "Ale", "eksistěrujo", "mimo", "togo", "ceła", "kopica",
            "narěcow", ",", "ako", "na", "pśikład", "slěpjańska", ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS)
def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens):
    """Punctuation is split off while diacritic-bearing words stay intact."""
    doc = dsb_tokenizer(text)
    words = [token.text for token in doc if not token.is_space]
    assert words == expected_tokens
		Loading…
	
		Reference in New Issue
	
	Block a user