Add Lower Sorbian support. (#10431)

* Add basic support for Lower Sorbian.

* Add some tests for dsb.

* Update spacy/lang/dsb/examples.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
jnphilipp 2022-03-07 16:57:14 +01:00 committed by GitHub
parent 61ba5450ff
commit 5ca0dbae76
8 changed files with 182 additions and 0 deletions

spacy/lang/dsb/__init__.py
@@ -0,0 +1,16 @@
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults


class LowerSorbianDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class LowerSorbian(Language):
    lang = "dsb"
    Defaults = LowerSorbianDefaults


__all__ = ["LowerSorbian"]
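
With the package in place, the language can be instantiated through spaCy's standard entry point. A minimal sketch (not part of the diff), relying only on the lang = "dsb" registration above:

import spacy

# Sketch: spacy.blank resolves the "dsb" code to the LowerSorbian class
# defined above and returns a blank (tokenizer-only) pipeline.
nlp = spacy.blank("dsb")
doc = nlp("Zwóstanjo pótakem hyšći wjele źěła.")
print([token.text for token in doc])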

spacy/lang/dsb/examples.py
@@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.dsb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
"Mi so tu jara derje spodoba.",
"Kotre nowniny chceće měć?",
"Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.",
"Zwóstanjo pótakem hyšći wjele źěła."
]
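
The docstring above assumes an existing nlp object; a minimal usage sketch (not part of the diff) that makes it concrete:

import spacy
from spacy.lang.dsb.examples import sentences

# Sketch: stream the example sentences through a blank dsb pipeline.
nlp = spacy.blank("dsb")
for doc in nlp.pipe(sentences):
    print(len(doc), doc.text)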

spacy/lang/dsb/lex_attrs.py
@@ -0,0 +1,77 @@
from ...attrs import LIKE_NUM

_num_words = [
    "nul",
    "jaden", "jadna", "jadno",
    "dwa", "dwě",
    "tśi", "tśo",
    "styri", "styrjo",
    "pěś", "pěśo",
    "šesć", "šesćo",
    "sedym", "sedymjo",
    "wósym", "wósymjo",
    "źewjeś", "źewjeśo",
    "źaseś", "źaseśo",
    "jadnassćo",
    "dwanassćo",
    "tśinasćo",
    "styrnasćo",
    "pěśnasćo",
    "šesnasćo",
    "sedymnasćo",
    "wósymnasćo",
    "źewjeśnasćo",
    "dwanasćo", "dwaźasća",
    "tśiźasća",
    "styrźasća",
    "pěśźaset",
    "šesćźaset",
    "sedymźaset",
    "wósymźaset",
    "źewjeśźaset",
    "sto",
    "tysac",
    "milion",
    "miliarda",
    "bilion",
    "biliarda",
    "trilion",
    "triliarda",
]

_ordinal_words = [
    "prědny", "prědna", "prědne",
    "drugi", "druga", "druge",
    "tśeśi", "tśeśa", "tśeśe",
    "stwórty", "stwórta", "stwórte",
    "pěty", "pěta", "pěte",
    "šesty", "šesta", "šeste",
    "sedymy", "sedyma", "sedyme",
    "wósymy", "wósyma", "wósyme",
    "źewjety", "źewjeta", "źewjete",
    "źasety", "źaseta", "źasete",
    "jadnasty", "jadnasta", "jadnaste",
    "dwanasty", "dwanasta", "dwanaste",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    # Check cardinal number words
    if text_lower in _num_words:
        return True
    # Check ordinal number words
    if text_lower in _ordinal_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}
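
A quick sanity check of like_num, mirroring the cases exercised by the tests further down (a sketch, not part of the diff):

from spacy.lang.dsb.lex_attrs import like_num

# Digits, grouped digits, fractions, cardinal words, and ordinal words
# all count as number-like; ordinary vocabulary does not.
assert like_num("10,000")
assert like_num("1/2")
assert like_num("dwanassćo")  # twelve
assert like_num("drugi")      # second (ordinal)
assert not like_num("kopica")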

spacy/lang/dsb/stop_words.py
@@ -0,0 +1,15 @@
STOP_WORDS = set(
"""
a abo aby ako ale
daniž dokulaž
gaž
jolic
pak pótom
teke togodla
""".split()
)
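
Since LowerSorbianDefaults wires STOP_WORDS into the vocab, matching tokens get Token.is_stop set. A small sketch (not part of the diff):

import spacy

# Sketch: all three tokens appear in STOP_WORDS above, so each one
# should be flagged as a stop word.
nlp = spacy.blank("dsb")
doc = nlp("ale pak teke")
assert all(token.is_stop for token in doc)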

spacy/tests/conftest.py
@@ -99,6 +99,11 @@ def de_vocab():
    return get_lang_class("de")().vocab


@pytest.fixture(scope="session")
def dsb_tokenizer():
    return get_lang_class("dsb")().tokenizer


@pytest.fixture(scope="session")
def el_tokenizer():
    return get_lang_class("el")().tokenizer

spacy/tests/lang/dsb/__init__.py
(new empty file)

spacy/tests/lang/dsb/test_text.py
@@ -0,0 +1,25 @@
import pytest


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("jadno", True),
        ("dwanassćo", True),
        ("milion", True),
        ("sto", True),
        ("ceła", False),
        ("kopica", False),
        ("narěcow", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(dsb_tokenizer, text, match):
    tokens = dsb_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match

spacy/tests/lang/dsb/test_tokenizer.py
@@ -0,0 +1,29 @@
import pytest

DSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Ale eksistěrujo mimo togo ceła kopica narěcow, ako na pśikład slěpjańska.",
        [
            "Ale",
            "eksistěrujo",
            "mimo",
            "togo",
            "ceła",
            "kopica",
            "narěcow",
            ",",
            "ako",
            "na",
            "pśikład",
            "slěpjańska",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS)
def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens):
    tokens = dsb_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
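
The new tests can then be run with pytest; a sketch assuming the standard spaCy checkout layout (the path is not stated in the diff):

import pytest

# Equivalent to running "pytest spacy/tests/lang/dsb -v" from the
# repository root; assumes the standard spaCy test layout.
pytest.main(["spacy/tests/lang/dsb", "-v"])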