Mirror of https://github.com/explosion/spaCy.git · synced 2025-10-31 16:07:41 +03:00
Add Upper Sorbian support. (#10432)

* Add basic support for Upper Sorbian.
* Add tokenizer exceptions and tests.
* Update spacy/lang/hsb/examples.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent a6d5824e5f
commit 7ed7908716
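Once this change is applied, the new language is reachable through spaCy's standard entry points. A minimal usage sketch (assuming a spaCy build that includes this commit; the sample sentence is taken from the tokenizer test below):

    import spacy

    # Creating a blank "hsb" pipeline picks up UpperSorbianDefaults:
    # stop words, lexical attributes, and tokenizer exceptions.
    nlp = spacy.blank("hsb")
    doc = nlp("Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow.")
    print([t.text for t in doc])  # "resp." survives as a single token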
spacy/lang/hsb/__init__.py (new file, 18 lines)

@@ -0,0 +1,18 @@
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults


class UpperSorbianDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS


class UpperSorbian(Language):
    lang = "hsb"
    Defaults = UpperSorbianDefaults


__all__ = ["UpperSorbian"]
spacy/lang/hsb/examples.py (new file, 15 lines)

@@ -0,0 +1,15 @@
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.hsb.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "To běšo wjelgin raźone a jo se wót luźi derje pśiwzeło. Tak som dožywiła wjelgin", | ||||
|     "Jogo pśewóźowarce stej groniłej, až how w serbskich stronach njama Santa Claus nic pytaś.", | ||||
|     "A ten sobuźěłaśeŕ Statneje biblioteki w Barlinju jo pśimjeł drogotne knigły bźez rukajcowu z nagima rukoma!", | ||||
|     "Take wobchadanje z našym kulturnym derbstwom zewšym njejźo.", | ||||
|     "Wopśimjeśe drugich pśinoskow jo było na wusokem niwowje, ako pśecej." | ||||
| ] | ||||
spacy/lang/hsb/lex_attrs.py (new file, 77 lines)

@@ -0,0 +1,77 @@
from ...attrs import LIKE_NUM

_num_words = [
    "nul",
    "jedyn", "jedna", "jedne",
    "dwaj", "dwě",
    "tři", "třo",
    "štyri", "štyrjo",
    "pjeć",
    "šěsć",
    "sydom",
    "wosom",
    "dźewjeć",
    "dźesać",
    "jědnaće",
    "dwanaće",
    "třinaće",
    "štyrnaće",
    "pjatnaće",
    "šěsnaće",
    "sydomnaće",
    "wosomnaće",
    "dźewjatnaće",
    "dwaceći",
    "třiceći",
    "štyrceći",
    "pjećdźesat",
    "šěsćdźesat",
    "sydomdźesat",
    "wosomdźesat",
    "dźewjećdźesat",
    "sto",
    "tysac",
    "milion",
    "miliarda",
    "bilion",
    "biliarda",
    "trilion",
    "triliarda",
]

_ordinal_words = [
    "prěni", "prěnja", "prěnje",
    "druhi", "druha", "druhe",
    "třeći", "třeća", "třeće",
    "štwórty", "štwórta", "štwórte",
    "pjaty", "pjata", "pjate",
    "šěsty", "šěsta", "šěste",
    "sydmy", "sydma", "sydme",
    "wosmy", "wosma", "wosme",
    "dźewjaty", "dźewjata", "dźewjate",
    "dźesaty", "dźesata", "dźesate",
    "jědnaty", "jědnata", "jědnate",
    "dwanaty", "dwanata", "dwanate",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True
    # Check cardinal and ordinal number words
    if text_lower in _ordinal_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}
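As a quick sanity check, like_num can be called directly; a sketch mirroring the cases in spacy/tests/lang/hsb/test_text.py below:

    from spacy.lang.hsb.lex_attrs import like_num

    assert like_num("10,000")    # separators are stripped before isdigit()
    assert like_num("1/2")       # simple fractions are accepted
    assert like_num("dwanaće")   # number words are matched case-insensitively
    assert not like_num("wona")  # ordinary words are rejected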
spacy/lang/hsb/stop_words.py (new file, 19 lines)

@@ -0,0 +1,19 @@
STOP_WORDS = set(
    """
a abo ale ani

dokelž

hdyž

jeli jelizo

kaž

pak potom

tež tohodla

zo zoby
""".split()
)
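These entries surface as the is_stop lexical attribute once the language is loaded; a minimal sketch, assuming the blank "hsb" pipeline registered by this commit:

    import spacy

    nlp = spacy.blank("hsb")
    doc = nlp("a tež")
    assert all(token.is_stop for token in doc)  # both words are in STOP_WORDS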
spacy/lang/hsb/tokenizer_exceptions.py (new file, 18 lines)

@@ -0,0 +1,18 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc

_exc = {}
for exc_data in [
    {ORTH: "mil.", NORM: "milion"},
    {ORTH: "wob.", NORM: "wobydler"},
]:
    _exc[exc_data[ORTH]] = [exc_data]

for orth in [
    "resp.",
]:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
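The effect of the NORM entries can be checked on the resulting tokens; a sketch assuming the blank "hsb" pipeline, where norm_ falls back to the token text when no exception applies:

    import spacy

    nlp = spacy.blank("hsb")
    doc = nlp("1 mil. wob.")
    print([(t.text, t.norm_) for t in doc])
    # expected: [('1', '1'), ('mil.', 'milion'), ('wob.', 'wobydler')]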
spacy/tests/conftest.py

@@ -221,6 +221,11 @@ def ja_tokenizer():
    return get_lang_class("ja")().tokenizer


@pytest.fixture(scope="session")
def hsb_tokenizer():
    return get_lang_class("hsb")().tokenizer


@pytest.fixture(scope="session")
def ko_tokenizer():
    pytest.importorskip("natto")
spacy/tests/lang/hsb/__init__.py (new file, 0 lines)
spacy/tests/lang/hsb/test_text.py (new file, 25 lines)

@@ -0,0 +1,25 @@
import pytest


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("10,000", True),
        ("10,00", True),
        ("jedne", True),
        ("dwanaće", True),
        ("milion", True),
        ("sto", True),
        ("załožene", False),
        ("wona", False),
        ("powšitkownej", False),
        (",", False),
        ("1/2", True),
    ],
)
def test_lex_attrs_like_number(hsb_tokenizer, text, match):
    tokens = hsb_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match
spacy/tests/lang/hsb/test_tokenizer.py (new file, 32 lines)

@@ -0,0 +1,32 @@
import pytest

HSB_BASIC_TOKENIZATION_TESTS = [
    (
        "Hornjoserbšćina wobsteji resp. wobsteješe z wjacorych dialektow, kotrež so zdźěla chětro wot so rozeznawachu.",
        [
            "Hornjoserbšćina",
            "wobsteji",
            "resp.",
            "wobsteješe",
            "z",
            "wjacorych",
            "dialektow",
            ",",
            "kotrež",
            "so",
            "zdźěla",
            "chětro",
            "wot",
            "so",
            "rozeznawachu",
            ".",
        ],
    ),
]


@pytest.mark.parametrize("text,expected_tokens", HSB_BASIC_TOKENIZATION_TESTS)
def test_hsb_tokenizer_basic(hsb_tokenizer, text, expected_tokens):
    tokens = hsb_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
    assert expected_tokens == token_list