mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-23 15:54:13 +03:00
Add Upper Sorbian support. (#10432)
* Add basic support for Upper Sorbian. * Add tokenizer exceptions and tests. * Update spacy/lang/hsb/examples.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent
a6d5824e5f
commit
7ed7908716
18
spacy/lang/hsb/__init__.py
Normal file
18
spacy/lang/hsb/__init__.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults


class UpperSorbianDefaults(BaseDefaults):
    """Language-specific defaults (lexical attributes, stop words and
    tokenizer exceptions) plugged into the shared Language machinery."""

    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS


class UpperSorbian(Language):
    """Upper Sorbian language subclass."""

    # ISO 639-3 code for Upper Sorbian.
    lang = "hsb"
    Defaults = UpperSorbianDefaults


__all__ = ["UpperSorbian"]
|
15
spacy/lang/hsb/examples.py
Normal file
15
spacy/lang/hsb/examples.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.hsb.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin",
|
||||
"Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.",
|
||||
"A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!",
|
||||
"Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.",
|
||||
"Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej."
|
||||
]
|
77
spacy/lang/hsb/lex_attrs.py
Normal file
77
spacy/lang/hsb/lex_attrs.py
Normal file
|
@ -0,0 +1,77 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = [
|
||||
"nul",
|
||||
"jedyn", "jedna", "jedne",
|
||||
"dwaj", "dwě",
|
||||
"tři", "třo",
|
||||
"štyri", "štyrjo",
|
||||
"pjeć",
|
||||
"šěsć",
|
||||
"sydom",
|
||||
"wosom",
|
||||
"dźewjeć",
|
||||
"dźesać",
|
||||
"jědnaće",
|
||||
"dwanaće",
|
||||
"třinaće",
|
||||
"štyrnaće",
|
||||
"pjatnaće",
|
||||
"šěsnaće",
|
||||
"sydomnaće",
|
||||
"wosomnaće",
|
||||
"dźewjatnaće",
|
||||
"dwaceći"
|
||||
"třiceći",
|
||||
"štyrceći",
|
||||
"pjećdźesat",
|
||||
"šěsćdźesat",
|
||||
"sydomdźesat",
|
||||
"wosomdźesat",
|
||||
"dźewjećdźesat",
|
||||
"sto",
|
||||
"tysac",
|
||||
"milion",
|
||||
"miliarda",
|
||||
"bilion",
|
||||
"biliarda",
|
||||
"trilion",
|
||||
"triliarda",
|
||||
]
|
||||
|
||||
_ordinal_words = [
|
||||
"prěni", "prěnja", "prěnje",
|
||||
"druhi", "druha", "druhe",
|
||||
"třeći", "třeća", "třeće",
|
||||
"štwórty", "štwórta", "štwórte",
|
||||
"pjaty", "pjata", "pjate",
|
||||
"šěsty", "šěsta", "šěste",
|
||||
"sydmy", "sydma", "sydme",
|
||||
"wosmy", "wosma", "wosme",
|
||||
"dźewjaty", "dźewjata", "dźewjate",
|
||||
"dźesaty", "dźesata", "dźesate",
|
||||
"jědnaty", "jědnata", "jědnate",
|
||||
"dwanaty", "dwanata", "dwanate"
|
||||
]
|
||||
|
||||
|
||||
def like_num(text):
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count("/") == 1:
|
||||
num, denom = text.split("/")
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
text_lower = text.lower()
|
||||
if text_lower in _num_words:
|
||||
return True
|
||||
# Check ordinal number
|
||||
if text_lower in _ordinal_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
# Register the language-specific LIKE_NUM getter in spaCy's lexical
# attribute table.
LEX_ATTRS = {LIKE_NUM: like_num}
|
19
spacy/lang/hsb/stop_words.py
Normal file
19
spacy/lang/hsb/stop_words.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# Upper Sorbian stop words: high-frequency function words (conjunctions,
# subordinators and discourse particles).
STOP_WORDS = {
    "a", "abo", "ale", "ani",
    "dokelž",
    "hdyž",
    "jeli", "jelizo",
    "kaž",
    "pak", "potom",
    "tež", "tohodla",
    "zo", "zoby",
}
|
18
spacy/lang/hsb/tokenizer_exceptions.py
Normal file
18
spacy/lang/hsb/tokenizer_exceptions.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc


_exc = {}

# Abbreviations that normalize to a full word form.
for abbr, norm in [("mil.", "milion"), ("wob.", "wobydler")]:
    _exc[abbr] = [{ORTH: abbr, NORM: norm}]

# Abbreviations kept as a single token without a special norm.
for orth in ["resp."]:
    _exc[orth] = [{ORTH: orth}]


# Merge the language-specific exceptions into the shared base set.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
|
@ -221,6 +221,11 @@ def ja_tokenizer():
|
|||
return get_lang_class("ja")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def hsb_tokenizer():
|
||||
return get_lang_class("hsb")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ko_tokenizer():
|
||||
pytest.importorskip("natto")
|
||||
|
|
0
spacy/tests/lang/hsb/__init__.py
Normal file
0
spacy/tests/lang/hsb/__init__.py
Normal file
25
spacy/tests/lang/hsb/test_text.py
Normal file
25
spacy/tests/lang/hsb/test_text.py
Normal file
|
@ -0,0 +1,25 @@
|
|||
import pytest


# (input text, expected like_num flag) pairs: digits, separators, number
# words, fractions, and non-number words/punctuation.
_LIKE_NUM_CASES = [
    ("10", True),
    ("1", True),
    ("10,000", True),
    ("10,00", True),
    ("jedne", True),
    ("dwanaće", True),
    ("milion", True),
    ("sto", True),
    ("załožene", False),
    ("wona", False),
    ("powšitkownej", False),
    (",", False),
    ("1/2", True),
]


@pytest.mark.parametrize("text,match", _LIKE_NUM_CASES)
def test_lex_attrs_like_number(hsb_tokenizer, text, match):
    """Each input must tokenize to one token whose like_num matches."""
    doc = hsb_tokenizer(text)
    assert len(doc) == 1
    assert doc[0].like_num == match
|
32
spacy/tests/lang/hsb/test_tokenizer.py
Normal file
32
spacy/tests/lang/hsb/test_tokenizer.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
import pytest

# (sentence, expected token texts) pairs exercising basic tokenization,
# including the "resp." tokenizer exception.
HSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow, kotrež so zdźěla chětro wot so rozeznawachu.",
        "Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow , "
        "kotrež so zdźěla chětro wot so rozeznawachu .".split(),
    ),
]


@pytest.mark.parametrize("text,expected_tokens", HSB_BASIC_TOKENIZATION_TESTS)
def test_hsb_tokenizer_basic(hsb_tokenizer, text, expected_tokens):
    """The tokenizer must produce exactly the expected non-space tokens."""
    tokens = [token.text for token in hsb_tokenizer(text) if not token.is_space]
    assert expected_tokens == tokens
|
Loading…
Reference in New Issue
Block a user