Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-12 02:06:31 +03:00
Add Lower Sorbian support. (#10431)
* Add basic support for Lower Sorbian.
* Add some tests for dsb.
* Update spacy/lang/dsb/examples.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
parent 61ba5450ff
commit 5ca0dbae76
spacy/lang/dsb/__init__.py (new file, 16 lines)
@@ -0,0 +1,16 @@
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults


class LowerSorbianDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class LowerSorbian(Language):
    lang = "dsb"
    Defaults = LowerSorbianDefaults


__all__ = ["LowerSorbian"]
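A minimal usage sketch (not part of the commit): once this module is registered, the language can be created through its "dsb" code with spacy.blank, spaCy's standard entry point for blank pipelines. The example sentence is taken from examples.py below.

import spacy

nlp = spacy.blank("dsb")  # resolves to LowerSorbian via the "dsb" code
doc = nlp("Mi so tu jara derje spodoba.")
print([token.text for token in doc])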
spacy/lang/dsb/examples.py (new file, 15 lines)
@@ -0,0 +1,15 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.dsb.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
    "Mi so tu jara derje spodoba.",
    "Kotre nowniny chceće měć?",
    "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.",
    "Zwóstanjo pótakem hyšći wjele źěła."
]
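The docstring above assumes an existing nlp object; a runnable version of that example might look like this (a sketch, not part of the commit):

import spacy
from spacy.lang.dsb.examples import sentences

nlp = spacy.blank("dsb")
for doc in nlp.pipe(sentences):
    print(doc.text)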
spacy/lang/dsb/lex_attrs.py (new file, 77 lines)
@@ -0,0 +1,77 @@
from ...attrs import LIKE_NUM

_num_words = [
    "nul",
    "jaden", "jadna", "jadno",
    "dwa", "dwě",
    "tśi", "tśo",
    "styri", "styrjo",
    "pěś", "pěśo",
    "šesć", "šesćo",
    "sedym", "sedymjo",
    "wósym", "wósymjo",
    "źewjeś", "źewjeśo",
    "źaseś", "źaseśo",
    "jadnassćo",
    "dwanassćo",
    "tśinasćo",
    "styrnasćo",
    "pěśnasćo",
    "šesnasćo",
    "sedymnasćo",
    "wósymnasćo",
    "źewjeśnasćo",
    "dwanasćo", "dwaźasća",
    "tśiźasća",
    "styrźasća",
    "pěśźaset",
    "šesćźaset",
    "sedymźaset",
    "wósymźaset",
    "źewjeśźaset",
    "sto",
    "tysac",
    "milion",
    "miliarda",
    "bilion",
    "biliarda",
    "trilion",
    "triliarda",
]

_ordinal_words = [
    "prědny", "prědna", "prědne",
    "drugi", "druga", "druge",
    "tśeśi", "tśeśa", "tśeśe",
    "stwórty", "stwórta", "stwórte",
    "pěty", "pěta", "pěte",
    "šesty", "šesta", "šeste",
    "sedymy", "sedyma", "sedyme",
    "wósymy", "wósyma", "wósyme",
    "źewjety", "źewjeta", "źewjete",
    "źasety", "źaseta", "źasete",
    "jadnasty", "jadnasta", "jadnaste",
    "dwanasty", "dwanasta", "dwanaste"
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}
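like_num strips sign prefixes and digit separators before checking isdigit(), accepts simple fractions, and falls back to the cardinal and ordinal word lists. A quick illustration (a sketch, not part of the commit; the import path assumes the module is installed as part of spaCy):

from spacy.lang.dsb.lex_attrs import like_num

assert like_num("10,000")      # separators removed, then isdigit()
assert like_num("-3")          # leading sign stripped first
assert like_num("1/2")         # numerator and denominator both digits
assert like_num("Dwanassćo")   # case-insensitive match in _num_words
assert like_num("drugi")       # ordinal words also count
assert not like_num("kopica")  # ordinary word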
spacy/lang/dsb/stop_words.py (new file, 15 lines)
@@ -0,0 +1,15 @@
STOP_WORDS = set(
    """
a abo aby ako ale až

daniž dokulaž

gaž

jolic

pak pótom

teke togodla
""".split()
)
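Stop words are exposed through Token.is_stop once the language is loaded; a quick check (a sketch, not part of the commit):

import spacy

nlp = spacy.blank("dsb")
doc = nlp("ale jolic teke")
print([(token.text, token.is_stop) for token in doc])  # all True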
spacy/tests/conftest.py (5 lines added)
@@ -99,6 +99,11 @@ def de_vocab():
    return get_lang_class("de")().vocab


@pytest.fixture(scope="session")
def dsb_tokenizer():
    return get_lang_class("dsb")().tokenizer


@pytest.fixture(scope="session")
def el_tokenizer():
    return get_lang_class("el")().tokenizer
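Because the fixture is session-scoped, pytest builds the dsb tokenizer once per test run; the test modules below simply request it by the parameter name dsb_tokenizer.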
spacy/tests/lang/dsb/__init__.py (new file, empty)
spacy/tests/lang/dsb/test_text.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import pytest


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("jadno", True),
        ("dwanassćo", True),
        ("milion", True),
        ("sto", True),
        ("ceła", False),
        ("kopica", False),
        ("narěcow", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(dsb_tokenizer, text, match):
    tokens = dsb_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match
spacy/tests/lang/dsb/test_tokenizer.py (new file, 29 lines)
@@ -0,0 +1,29 @@
import pytest

DSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Ale eksistěrujo mimo togo ceła kopica narěcow, ako na pśikład slěpjańska.",
        [
            "Ale",
            "eksistěrujo",
            "mimo",
            "togo",
            "ceła",
            "kopica",
            "narěcow",
            ",",
            "ako",
            "na",
            "pśikład",
            "slěpjańska",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS)
def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens):
    tokens = dsb_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list
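With the conftest fixture in place, the new tests can be run in isolation with, for example, pytest spacy/tests/lang/dsb from the repository root.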