Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)
	Update Korean defaults for Tokenizer (#10322)
Update the Korean defaults for `Tokenizer` so that tokenization follows UD Korean Kaist.
parent f32ee2e533
commit 30030176ee
spacy/lang/ko/__init__.py
@@ -1,5 +1,6 @@
 from typing import Iterator, Any, Dict
 
+from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
@@ -85,6 +86,7 @@ class KoreanDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    infixes = TOKENIZER_INFIXES
 
 
 class Korean(Language):
spacy/lang/ko/punctuation.py (new file)
@@ -0,0 +1,12 @@
+from ..char_classes import LIST_QUOTES
+from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
+
+
+_infixes = (
+    ["·", "ㆍ", "\(", "\)"]
+    + [r"(?<=[0-9])~(?=[0-9-])"]
+    + LIST_QUOTES
+    + BASE_TOKENIZER_INFIXES
+)
+
+TOKENIZER_INFIXES = _infixes
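
The infix list above is what the rule-based `Tokenizer` compiles into a single regex and uses to find split points inside a token. Below is a minimal sketch of that step, assuming a spaCy 3.x install that already includes this commit (so `spacy.lang.ko.punctuation` is importable); `compile_infix_regex` is spaCy's standard helper for this:

from spacy.lang.ko.punctuation import TOKENIZER_INFIXES
from spacy.util import compile_infix_regex

# Compile the infix patterns into one regex, as the rule-based tokenizer does.
infix_re = compile_infix_regex(TOKENIZER_INFIXES)

# The new literal rules split at parentheses (and the middle dot) ...
print([m.group() for m in infix_re.finditer("부(富)는")])    # ['(', ')']
# ... and "~" is only treated as an infix between digits.
print([m.group() for m in infix_re.finditer("1982~1983")])  # ['~']
print([m.group() for m in infix_re.finditer("그렇구나~")])   # []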
spacy/tests/conftest.py
@@ -227,6 +227,19 @@ def ko_tokenizer():
     return get_lang_class("ko")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def ko_tokenizer_tokenizer():
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.Tokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer
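
The `ko_tokenizer_tokenizer` fixture above also shows how to opt into this behaviour outside the test suite: override `[nlp.tokenizer]` with `spacy.Tokenizer.v1` instead of the language's default (morpheme-based) tokenizer. A small standalone usage sketch along the same lines, with the sample text and expected tokens taken from the new tests below:

from spacy.util import get_lang_class

# Same config override as the fixture: select the rule-based spacy.Tokenizer.v1
# rather than Korean's default tokenizer.
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}}
nlp = get_lang_class("ko").from_config(config)

doc = nlp.tokenizer("부(富)는")
print([t.text for t in doc])  # ['부', '(', '富', ')', '는']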
spacy/tests/lang/ko/test_tokenizer.py
@@ -47,3 +47,23 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
 def test_ko_empty_doc(ko_tokenizer):
     tokens = ko_tokenizer("")
     assert len(tokens) == 0
+
+
+# fmt: off
+SPACY_TOKENIZER_TESTS = [
+    ("있다.", "있다 ."),
+    ("'예'는", "' 예 ' 는"),
+    ("부 (富) 는", "부 ( 富 ) 는"),
+    ("부(富)는", "부 ( 富 ) 는"),
+    ("1982~1983.", "1982 ~ 1983 ."),
+    ("사과·배·복숭아·수박은 모두 과일이다.", "사과 · 배 · 복숭아 · 수박은 모두 과일이다 ."),
+    ("그렇구나~", "그렇구나~"),
+    ("『9시 반의 당구』,", "『 9시 반의 당구 』 ,"),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", SPACY_TOKENIZER_TESTS)
+def test_ko_spacy_tokenizer(ko_tokenizer_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_tokenizer(text)]
+    assert tokens == expected_tokens.split()