Add Upper Sorbian support. (#10432)

* Add basic support for Upper Sorbian.

* Add tokenizer exceptions and tests.

* Update spacy/lang/hsb/examples.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
jnphilipp 2022-03-07 16:20:39 +01:00 committed by GitHub
parent a6d5824e5f
commit 7ed7908716
9 changed files with 209 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults


class UpperSorbianDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS


class UpperSorbian(Language):
    lang = "hsb"
    Defaults = UpperSorbianDefaults


__all__ = ["UpperSorbian"]
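
A minimal usage sketch (not part of the diff): once this module is registered, the standard spacy.blank factory resolves the "hsb" code to the UpperSorbian class above.

import spacy

# spacy.blank("hsb") instantiates UpperSorbian via the language registry
nlp = spacy.blank("hsb")
doc = nlp("Hornjoserbšćina wobsteji z wjacorych dialektow.")
print([token.text for token in doc])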

View File

@@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.hsb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin",
    "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.",
    "A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!",
    "Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.",
    "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej."
]
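
A runnable version of the docstring's usage (a sketch, assuming a blank pipeline rather than a trained model):

import spacy
from spacy.lang.hsb.examples import sentences

nlp = spacy.blank("hsb")
for doc in nlp.pipe(sentences):
    # print the token count and the start of each example sentence
    print(len(doc), doc.text[:40])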

View File

@@ -0,0 +1,77 @@
from ...attrs import LIKE_NUM

_num_words = [
    "nul",
    "jedyn", "jedna", "jedne",
    "dwaj", "dwě",
    "tři", "třo",
    "štyri", "štyrjo",
    "pjeć",
    "šěsć",
    "sydom",
    "wosom",
    "dźewjeć",
    "dźesać",
    "jědnaće",
    "dwanaće",
    "třinaće",
    "štyrnaće",
    "pjatnaće",
    "šěsnaće",
    "sydomnaće",
    "wosomnaće",
    "dźewjatnaće",
    "dwaceći",
    "třiceći",
    "štyrceći",
    "pjećdźesat",
    "šěsćdźesat",
    "sydomdźesat",
    "wosomdźesat",
    "dźewjećdźesat",
    "sto",
    "tysac",
    "milion",
    "miliarda",
    "bilion",
    "biliarda",
    "trilion",
    "triliarda",
]

_ordinal_words = [
    "prěni", "prěnja", "prěnje",
    "druhi", "druha", "druhe",
    "třeći", "třeća", "třeće",
    "štwórty", "štwórta", "štwórte",
    "pjaty", "pjata", "pjate",
    "šěsty", "šěsta", "šěste",
    "sydmy", "sydma", "sydme",
    "wosmy", "wosma", "wosme",
    "dźewjaty", "dźewjata", "dźewjate",
    "dźesaty", "dźesata", "dźesate",
    "jědnaty", "jědnata", "jědnate",
    "dwanaty", "dwanata", "dwanate",
]


def like_num(text):
    # Strip a leading sign before checking the remainder
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Fractions like "1/2" count as number-like
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    # Check cardinal number words
    if text_lower in _num_words:
        return True
    # Check ordinal number words
    if text_lower in _ordinal_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}
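
A quick check of like_num in isolation (a sketch; these calls only exercise the function above, no pipeline required):

from spacy.lang.hsb.lex_attrs import like_num

print(like_num("dwanaće"))  # True: listed in _num_words
print(like_num("10,000"))   # True: digits once "," and "." are stripped
print(like_num("1/2"))      # True: fraction with digit numerator and denominator
print(like_num("wona"))     # False: not a number word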

View File

@@ -0,0 +1,19 @@
STOP_WORDS = set(
    """
a abo ale ani

dokelž

hdyž

jeli jelizo

kaž

pak potom

tež tohodla

zo zoby
""".split()
)
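
Sketch of the effect (assuming a blank "hsb" pipeline): entries in STOP_WORDS surface through Token.is_stop.

import spacy

nlp = spacy.blank("hsb")
doc = nlp("a potom")
print([(token.text, token.is_stop) for token in doc])  # both tokens are stop words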

View File

@@ -0,0 +1,18 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc

_exc = dict()

for exc_data in [
    {ORTH: "mil.", NORM: "milion"},
    {ORTH: "wob.", NORM: "wobydler"},
]:
    _exc[exc_data[ORTH]] = [exc_data]

for orth in [
    "resp.",
]:
    _exc[orth] = [{ORTH: orth}]

TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
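
A sketch of what these exceptions buy (assuming a blank "hsb" pipeline): the abbreviations stay single tokens and carry the expanded norm.

import spacy

nlp = spacy.blank("hsb")
doc = nlp("1 mil. wob.")
print([(token.text, token.norm_) for token in doc])
# expected: [('1', '1'), ('mil.', 'milion'), ('wob.', 'wobydler')]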

View File

@@ -221,6 +221,11 @@ def ja_tokenizer():
    return get_lang_class("ja")().tokenizer


@pytest.fixture(scope="session")
def hsb_tokenizer():
    return get_lang_class("hsb")().tokenizer


@pytest.fixture(scope="session")
def ko_tokenizer():
    pytest.importorskip("natto")

View File

View File

@@ -0,0 +1,25 @@
import pytest


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("jedne", True),
        ("dwanaće", True),
        ("milion", True),
        ("sto", True),
        ("załožene", False),
        ("wona", False),
        ("powšitkownej", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(hsb_tokenizer, text, match):
    tokens = hsb_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match

View File

@@ -0,0 +1,32 @@
import pytest

HSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow, kotrež so zdźěla chětro wot so rozeznawachu.",
        [
            "Hornjoserbšćina",
            "wobsteji",
            "resp.",
            "wobsteješe",
            "z",
            "wjacorych",
            "dialektow",
            ",",
            "kotrež",
            "so",
            "zdźěla",
            "chětro",
            "wot",
            "so",
            "rozeznawachu",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", HSB_BASIC_TOKENIZATION_TESTS)
def test_hsb_tokenizer_basic(hsb_tokenizer, text, expected_tokens):
    tokens = hsb_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list