mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Add Lower Sorbian support. (#10431)
* Add basic support for Lower Sorbian. * Add some tests for dsb. * Update spacy/lang/dsb/examples.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									61ba5450ff
								
							
						
					
					
						commit
						5ca0dbae76
					
				
							
								
								
									
										16
									
								
								spacy/lang/dsb/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								spacy/lang/dsb/__init__.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,16 @@
 | 
			
		|||
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults


class LowerSorbianDefaults(BaseDefaults):
    """Language defaults for Lower Sorbian: lexical attribute getters
    (LIKE_NUM) and the stop-word list defined in this package."""

    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class LowerSorbian(Language):
    """Lower Sorbian language class, registered under the code "dsb"."""

    lang = "dsb"
    Defaults = LowerSorbianDefaults


__all__ = ["LowerSorbian"]
 | 
			
		||||
							
								
								
									
										15
									
								
								spacy/lang/dsb/examples.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								spacy/lang/dsb/examples.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,15 @@
 | 
			
		|||
"""
 | 
			
		||||
Example sentences to test spaCy and its language models.
 | 
			
		||||
 | 
			
		||||
>>> from spacy.lang.dsb.examples import sentences
 | 
			
		||||
>>> docs = nlp.pipe(sentences)
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
sentences = [
 | 
			
		||||
    "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.",
 | 
			
		||||
    "Mi so tu jara derje spodoba.",
 | 
			
		||||
    "Kotre nowniny chceće měć?",
 | 
			
		||||
    "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.",
 | 
			
		||||
    "Zwóstanjo pótakem hyšći wjele źěła."
 | 
			
		||||
]
 | 
			
		||||
							
								
								
									
										77
									
								
								spacy/lang/dsb/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										77
									
								
								spacy/lang/dsb/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,77 @@
 | 
			
		|||
from ...attrs import LIKE_NUM
 | 
			
		||||
 | 
			
		||||
# Cardinal number words in Lower Sorbian (dsb). Pairs such as "tśi"/"tśo"
# list the base form alongside the masculine-personal form.
_num_words = [
    "nul",
    "jaden", "jadna", "jadno",
    "dwa", "dwě",
    "tśi", "tśo",
    "styri", "styrjo",
    "pěś", "pěśo",
    "šesć", "šesćo",
    "sedym", "sedymjo",
    "wósym", "wósymjo",
    "źewjeś", "źewjeśo",
    "źaseś", "źaseśo",
    "jadnassćo",
    "dwanassćo",
    "tśinasćo",
    "styrnasćo",
    "pěśnasćo",
    "šesnasćo",
    "sedymnasćo",
    "wósymnasćo",
    "źewjeśnasćo",
    "dwanasćo", "dwaźasća",
    "tśiźasća",
    "styrźasća",
    "pěśźaset",
    "šesćźaset",
    "sedymźaset",
    "wósymźaset",
    "źewjeśźaset",
    "sto",
    "tysac",
    "milion",
    "miliarda",
    "bilion",
    "biliarda",
    "trilion",
    "triliarda",
]

# Ordinal number words (masculine, feminine, neuter forms).
_ordinal_words = [
    "prědny", "prědna", "prědne",
    "drugi", "druga", "druge",
    "tśeśi", "tśeśa", "tśeśe",
    "stwórty", "stwórta", "stwórte",
    # Fix: "pêty"/"pête" used U+00EA (e with circumflex); Sorbian orthography
    # uses ě (U+011B), consistent with the cardinal "pěś" above and with
    # "pěta" in the same row.
    "pěty", "pěta", "pěte",
    "šesty", "šesta", "šeste",
    "sedymy", "sedyma", "sedyme",
    "wósymy", "wósyma", "wósyme",
    "źewjety", "źewjeta", "źewjete",
    "źasety", "źaseta", "źasete",
    "jadnasty", "jadnasta", "jadnaste",
    "dwanasty", "dwanasta", "dwanaste",
]


def like_num(text):
    """Return True if *text* resembles a number (backs the LIKE_NUM attribute).

    Matches digit strings (an optional leading sign and "," / "." separators
    are stripped first), simple fractions such as "1/2", and Lower Sorbian
    cardinal and ordinal number words, compared case-insensitively.
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Separators are removed wholesale, so "10,000" and "10,00" both match.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Fractions: exactly one slash with digits on both sides.
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
    return False
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
LEX_ATTRS = {LIKE_NUM: like_num}
 | 
			
		||||
							
								
								
									
										15
									
								
								spacy/lang/dsb/stop_words.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								spacy/lang/dsb/stop_words.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,15 @@
 | 
			
		|||
# Stop words for Lower Sorbian (dsb): conjunctions, particles, and other
# high-frequency function words, one whitespace-separated group per letter.
STOP_WORDS = set(
    """
a abo aby ako ale až

daniž dokulaž

gaž

jolic

pak pótom

teke togodla
""".split()
)
 | 
			
		||||
| 
						 | 
				
			
			@ -99,6 +99,11 @@ def de_vocab():
 | 
			
		|||
    return get_lang_class("de")().vocab
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.fixture(scope="session")
 | 
			
		||||
def dsb_tokenizer():
 | 
			
		||||
    return get_lang_class("dsb")().tokenizer
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.fixture(scope="session")
 | 
			
		||||
def el_tokenizer():
 | 
			
		||||
    return get_lang_class("el")().tokenizer
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										0
									
								
								spacy/tests/lang/dsb/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								spacy/tests/lang/dsb/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										25
									
								
								spacy/tests/lang/dsb/test_text.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								spacy/tests/lang/dsb/test_text.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,25 @@
 | 
			
		|||
import pytest
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    "text,match",
 | 
			
		||||
    [
 | 
			
		||||
        ("10", True),
 | 
			
		||||
        ("1", True),
 | 
			
		||||
        ("10,000", True),
 | 
			
		||||
        ("10,00", True),
 | 
			
		||||
        ("jadno", True),
 | 
			
		||||
        ("dwanassćo", True),
 | 
			
		||||
        ("milion", True),
 | 
			
		||||
        ("sto", True),
 | 
			
		||||
        ("ceła", False),
 | 
			
		||||
        ("kopica", False),
 | 
			
		||||
        ("narěcow", False),
 | 
			
		||||
        (",", False),
 | 
			
		||||
        ("1/2", True),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_lex_attrs_like_number(dsb_tokenizer, text, match):
 | 
			
		||||
    tokens = dsb_tokenizer(text)
 | 
			
		||||
    assert len(tokens) == 1
 | 
			
		||||
    assert tokens[0].like_num == match
 | 
			
		||||
							
								
								
									
										29
									
								
								spacy/tests/lang/dsb/test_tokenizer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								spacy/tests/lang/dsb/test_tokenizer.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,29 @@
 | 
			
		|||
import pytest
 | 
			
		||||
 | 
			
		||||
# Each entry: (raw text, expected texts of the non-space tokens).
DSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Ale eksistěrujo mimo togo ceła kopica narěcow, ako na pśikład slěpjańska.",
        [
            "Ale", "eksistěrujo", "mimo", "togo", "ceła", "kopica",
            "narěcow", ",", "ako", "na", "pśikład", "slěpjańska", ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS)
def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens):
    """Smoke test: the dsb tokenizer splits words and trailing punctuation."""
    doc = dsb_tokenizer(text)
    observed = [token.text for token in doc if not token.is_space]
    assert observed == expected_tokens
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user