	Update Korean defaults for Tokenizer (#10322)
Update the Korean language defaults for the rule-based `Tokenizer` so that its tokenization follows UD Korean Kaist.
parent f32ee2e533
commit 30030176ee
spacy/lang/ko/__init__.py

@@ -1,5 +1,6 @@
 from typing import Iterator, Any, Dict
 
+from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
@@ -85,6 +86,7 @@ class KoreanDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    infixes = TOKENIZER_INFIXES
 
 
 class Korean(Language):
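Not part of the diff: a minimal check of the new default, assuming a spaCy install that includes this change.

    # Sketch: KoreanDefaults now carries infix patterns, which the rule-based
    # Tokenizer picks up when it is built from these defaults.
    from spacy.lang.ko import Korean

    print("·" in Korean.Defaults.infixes)   # True: the middle dot is now an infix
    print("ㆍ" in Korean.Defaults.infixes)  # True: the Korean interpunct as well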
							
								
								
									
spacy/lang/ko/punctuation.py (new file, 12 lines added)
@@ -0,0 +1,12 @@
+from ..char_classes import LIST_QUOTES
+from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
+
+
+_infixes = (
+    ["·", "ㆍ", "\(", "\)"]
+    + [r"(?<=[0-9])~(?=[0-9-])"]
+    + LIST_QUOTES
+    + BASE_TOKENIZER_INFIXES
+)
+
+TOKENIZER_INFIXES = _infixes
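For context (not part of the diff): spaCy compiles an infix list like the one above into a single regex via `spacy.util.compile_infix_regex`. A rough sketch, using a trimmed stand-in list rather than the full `TOKENIZER_INFIXES`:

    # Assumes spaCy is installed. compile_infix_regex joins the patterns into
    # one regex; the Tokenizer uses its finditer to split inside token candidates.
    from spacy.util import compile_infix_regex

    example_infixes = ["·", "ㆍ", r"\(", r"\)", r"(?<=[0-9])~(?=[0-9-])"]  # trimmed subset
    infix_re = compile_infix_regex(example_infixes)

    print([m.group() for m in infix_re.finditer("부(富)는")])    # ['(', ')']
    print([m.group() for m in infix_re.finditer("1982~1983")])  # ['~']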
spacy/tests/conftest.py

@@ -227,6 +227,19 @@ def ko_tokenizer():
     return get_lang_class("ko")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def ko_tokenizer_tokenizer():
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.Tokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer
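Outside the test suite, the same config override selects the rule-based tokenizer for a blank Korean pipeline. A usage sketch, assuming a spaCy install that includes this change:

    # Use spacy.Tokenizer.v1 (and the new infix rules) instead of the default
    # Korean tokenizer, mirroring the fixture above.
    import spacy

    config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}}
    nlp = spacy.blank("ko", config=config)

    doc = nlp("부(富)는 1982~1983.")
    print([t.text for t in doc])
    # Expected, per the tests below: ['부', '(', '富', ')', '는', '1982', '~', '1983', '.']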
spacy/tests/lang/ko/test_tokenizer.py

@@ -47,3 +47,23 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
 def test_ko_empty_doc(ko_tokenizer):
     tokens = ko_tokenizer("")
     assert len(tokens) == 0
+
+
+# fmt: off
+SPACY_TOKENIZER_TESTS = [
+    ("있다.", "있다 ."),
+    ("'예'는", "' 예 ' 는"),
+    ("부 (富) 는", "부 ( 富 ) 는"),
+    ("부(富)는", "부 ( 富 ) 는"),
+    ("1982~1983.", "1982 ~ 1983 ."),
+    ("사과·배·복숭아·수박은 모두 과일이다.", "사과 · 배 · 복숭아 · 수박은 모두 과일이다 ."),
+    ("그렇구나~", "그렇구나~"),
+    ("『9시 반의 당구』,", "『 9시 반의 당구 』 ,"),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", SPACY_TOKENIZER_TESTS)
+def test_ko_spacy_tokenizer(ko_tokenizer_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_tokenizer(text)]
+    assert tokens == expected_tokens.split()