mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Add Lower Sorbian support. (#10431)
* Add basic support for Lower Sorbian. * Add some tests for dsb. * Update spacy/lang/dsb/examples.py Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									61ba5450ff
								
							
						
					
					
						commit
						5ca0dbae76
					
				
							
								
								
									
										16
									
								
								spacy/lang/dsb/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								spacy/lang/dsb/__init__.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,16 @@ | |||
"""Lower Sorbian (dsb) language module for spaCy.

Wires the dsb lexical attribute getters and stop words into a
``Language`` subclass registered under the code "dsb".
"""
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults


class LowerSorbianDefaults(BaseDefaults):
    """Language defaults for Lower Sorbian: only lexical attribute
    getters and stop words (no tokenizer exceptions or syntax data yet)."""

    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class LowerSorbian(Language):
    """Lower Sorbian language class (ISO 639-2 code "dsb")."""

    lang = "dsb"
    Defaults = LowerSorbianDefaults


__all__ = ["LowerSorbian"]
							
								
								
									
										15
									
								
								spacy/lang/dsb/examples.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								spacy/lang/dsb/examples.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,15 @@ | |||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.dsb.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Z tym stwori so wuměnjenje a zakład za dalše wobdźěłanje přez analyzu tekstoweje struktury a semantisku anotaciju a z tym tež za tu předstajenu digitalnu online-wersiju.", | ||||
|     "Mi so tu jara derje spodoba.", | ||||
|     "Kotre nowniny chceće měć?", | ||||
|     "Tak ako w slědnem lěśe jo teke lětosa jano doma zapustowaś móžno.", | ||||
|     "Zwóstanjo pótakem hyšći wjele źěła." | ||||
| ] | ||||
							
								
								
									
										77
									
								
								spacy/lang/dsb/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										77
									
								
								spacy/lang/dsb/lex_attrs.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,77 @@ | |||
| from ...attrs import LIKE_NUM | ||||
| 
 | ||||
# Cardinal number words of Lower Sorbian (several grammatical forms per
# numeral).  NOTE(review): "dwanasćo" below (next to "dwaźasća") looks like
# an alternate spelling of twelve ("dwanassćo" above) — confirm with a
# speaker; the data is kept verbatim here.
_num_words = [
    "nul",
    "jaden", "jadna", "jadno",
    "dwa", "dwě",
    "tśi", "tśo",
    "styri", "styrjo",
    "pěś", "pěśo",
    "šesć", "šesćo",
    "sedym", "sedymjo",
    "wósym", "wósymjo",
    "źewjeś", "źewjeśo",
    "źaseś", "źaseśo",
    "jadnassćo",
    "dwanassćo",
    "tśinasćo",
    "styrnasćo",
    "pěśnasćo",
    "šesnasćo",
    "sedymnasćo",
    "wósymnasćo",
    "źewjeśnasćo",
    "dwanasćo", "dwaźasća",
    "tśiźasća",
    "styrźasća",
    "pěśźaset",
    "šesćźaset",
    "sedymźaset",
    "wósymźaset",
    "źewjeśźaset",
    "sto",
    "tysac",
    "milion",
    "miliarda",
    "bilion",
    "biliarda",
    "trilion",
    "triliarda",
]

# Ordinal number words (masculine/feminine/neuter forms per numeral).
# NOTE(review): "pêty"/"pête" carry a circumflex (ê) where the rest of the
# data uses a haček (ě, as in "pěta") — likely a typo in the source data;
# confirm before changing the strings.
_ordinal_words = [
    "prědny", "prědna", "prědne",
    "drugi", "druga", "druge",
    "tśeśi", "tśeśa", "tśeśe",
    "stwórty", "stwórta", "stwórte",
    "pêty", "pěta", "pête",
    "šesty", "šesta", "šeste",
    "sedymy", "sedyma", "sedyme",
    "wósymy", "wósyma", "wósyme",
    "źewjety", "źewjeta", "źewjete",
    "źasety", "źaseta", "źasete",
    "jadnasty", "jadnasta", "jadnaste",
    "dwanasty", "dwanasta", "dwanaste",
]


def like_num(text):
    """Return True if *text* looks like a number in Lower Sorbian.

    Accepts plain digit strings (optionally with a single leading sign or
    approximation mark, and "," / "." as separators), simple fractions
    such as "1/2", and cardinal or ordinal number words from the lists
    above (matched case-insensitively).
    """
    # Drop at most one leading sign/approximation character.
    if text and text[0] in ("+", "-", "±", "~"):
        text = text[1:]
    # Strip group/decimal separators so "10,000" reads as a digit run.
    stripped = text.replace(",", "").replace(".", "")
    if stripped.isdigit():
        return True
    # Fractions: exactly one "/" with digits on both sides.
    parts = stripped.split("/")
    if len(parts) == 2 and all(part.isdigit() for part in parts):
        return True
    lowered = stripped.lower()
    return lowered in _num_words or lowered in _ordinal_words
| 
 | ||||
| 
 | ||||
# Attribute-ID -> getter mapping consumed by LowerSorbianDefaults.lex_attr_getters.
LEX_ATTRS = {LIKE_NUM: like_num}
							
								
								
									
										15
									
								
								spacy/lang/dsb/stop_words.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								spacy/lang/dsb/stop_words.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,15 @@ | |||
# Stop words for Lower Sorbian: conjunctions and other high-frequency
# function words.  Kept as a plain set literal, grouped roughly
# alphabetically as in the source data.
STOP_WORDS = {
    "a", "abo", "aby", "ako", "ale", "až",
    "daniž", "dokulaž",
    "gaž",
    "jolic",
    "pak", "pótom",
    "teke", "togodla",
}
|  | @ -99,6 +99,11 @@ def de_vocab(): | |||
|     return get_lang_class("de")().vocab | ||||
| 
 | ||||
| 
 | ||||
@pytest.fixture(scope="session")
def dsb_tokenizer():
    # Session-scoped so the Language object is built once and shared by
    # all Lower Sorbian tests.
    return get_lang_class("dsb")().tokenizer
| 
 | ||||
| 
 | ||||
@pytest.fixture(scope="session")
def el_tokenizer():
    # Session-scoped Greek tokenizer, shared by all "el" tests.
    return get_lang_class("el")().tokenizer
|  |  | |||
							
								
								
									
										0
									
								
								spacy/tests/lang/dsb/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								spacy/tests/lang/dsb/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										25
									
								
								spacy/tests/lang/dsb/test_text.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								spacy/tests/lang/dsb/test_text.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,25 @@ | |||
import pytest

# (input text, expected Token.like_num value).  Each input must come out
# of the tokenizer as exactly one token.
LIKE_NUM_CASES = [
    ("10", True),
    ("1", True),
    ("10,000", True),
    ("10,00", True),
    ("jadno", True),
    ("dwanassćo", True),
    ("milion", True),
    ("sto", True),
    ("ceła", False),
    ("kopica", False),
    ("narěcow", False),
    (",", False),
    ("1/2", True),
]


@pytest.mark.parametrize("text,match", LIKE_NUM_CASES)
def test_lex_attrs_like_number(dsb_tokenizer, text, match):
    """like_num flags digits, fractions and dsb number words — nothing else."""
    doc = dsb_tokenizer(text)
    assert len(doc) == 1
    assert doc[0].like_num == match
							
								
								
									
										29
									
								
								spacy/tests/lang/dsb/test_tokenizer.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								spacy/tests/lang/dsb/test_tokenizer.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,29 @@ | |||
import pytest

# One (raw text, expected non-space tokens) pair exercising basic word
# and punctuation splitting for Lower Sorbian.
DSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Ale eksistěrujo mimo togo ceła kopica narěcow, ako na pśikład slěpjańska.",
        [
            "Ale", "eksistěrujo", "mimo", "togo", "ceła", "kopica",
            "narěcow", ",", "ako", "na", "pśikład", "slěpjańska", ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", DSB_BASIC_TOKENIZATION_TESTS)
def test_dsb_tokenizer_basic(dsb_tokenizer, text, expected_tokens):
    """Punctuation is split off while diacritic-bearing words stay intact."""
    doc = dsb_tokenizer(text)
    words = [token.text for token in doc if not token.is_space]
    assert words == expected_tokens
		Loading…
	
		Reference in New Issue
	
	Block a user