Mirror of https://github.com/explosion/spaCy.git
	Fix JA Morph Values (#9449)
* Don't set empty / weird values in morph
* Update tests to handle empty morph values
* Fix everything
* Replace potentially problematic characters
* Fix test
parent ae1b3e960b
commit a3b7519aba
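For context on the hunks below: dtoken is the per-morpheme analysis record the Japanese tokenizer builds from SudachiPy output. A minimal sketch of its shape, restricted to the fields this diff actually touches (the real namedtuple in the module carries more fields, so treat this as an illustrative assumption):

from collections import namedtuple

# Trimmed, illustrative reconstruction: only the attributes used in the diff below.
DetailedToken = namedtuple("DetailedToken", ["surface", "lemma", "norm", "reading", "inf"])

# A non-inflecting particle: SudachiPy reports no inflection for it, so
# dtoken.inf is empty and, after this commit, is no longer written into morph.
dtoken = DetailedToken(surface="て", lemma="て", norm="て", reading="テ", inf="")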
spacy/lang/ja/__init__.py
@@ -3,6 +3,7 @@ from pathlib import Path
 import srsly
 from collections import namedtuple
 from thinc.api import Model
+import re
 
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -77,10 +78,14 @@ class JapaneseTokenizer(DummyTokenizer):
             # if there's no lemma info (it's an unk) just use the surface
             token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
             morph = {}
-            morph["inflection"] = dtoken.inf
+            if dtoken.inf:
+                # it's normal for this to be empty for non-inflecting types
+                morph["inflection"] = dtoken.inf
             token.norm_ = dtoken.norm
             if dtoken.reading:
-                morph["reading"] = dtoken.reading
+                # punctuation is its own reading, but we don't want values like
+                # "=" here
+                morph["reading"] = re.sub("[=|]", "_", dtoken.reading)
             token.morph = MorphAnalysis(self.vocab, morph)
         if self.need_subtokens:
             doc.user_data["sub_tokens"] = sub_tokens_list
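A minimal usage sketch of the behavior this hunk aims for, mirroring the test expectations added below (assumes a blank Japanese pipeline, which requires SudachiPy to be installed):

import spacy

nlp = spacy.blank("ja")
doc = nlp("2=3")

# Non-inflecting tokens no longer get an empty "inflection" feature,
# so morph.get returns an empty list for them instead of [""].
assert [t.morph.get("inflection") for t in doc] == [[], [], []]

# The reading of "=" is the punctuation character itself; it is rewritten
# to "_" so it cannot collide with the Feature=Value|Feature=Value markup
# of the morph string.
assert [t.morph.get("reading") for t in doc] == [["ニ"], ["_"], ["サン"]]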
spacy/tests/lang/ja/test_tokenizer.py
@@ -130,8 +130,13 @@ def test_ja_tokenizer_sub_tokens(
     [
         (
             "取ってつけた",
-            ("五段-ラ行;連用形-促音便", "", "下一段-カ行;連用形-一般", "助動詞-タ;終止形-一般"),
-            ("トッ", "テ", "ツケ", "タ"),
+            (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
+            (["トッ"], ["テ"], ["ツケ"], ["タ"]),
+        ),
+        (
+            "2=3",
+            ([], [], []),
+            (["ニ"], ["_"], ["サン"])
         ),
     ],
 )
@@ -139,9 +144,9 @@ def test_ja_tokenizer_inflections_reading_forms(
     ja_tokenizer, text, inflections, reading_forms
 ):
     tokens = ja_tokenizer(text)
-    test_inflections = [tt.morph.get("inflection")[0] for tt in tokens]
+    test_inflections = [tt.morph.get("inflection") for tt in tokens]
     assert test_inflections == list(inflections)
-    test_readings = [tt.morph.get("reading")[0] for tt in tokens]
+    test_readings = [tt.morph.get("reading") for tt in tokens]
     assert test_readings == list(reading_forms)
 
 
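Why "=" and "|" specifically: spaCy serializes a token's morphology as a FEATS-style string of Feature=Value pairs joined by "|", so a raw "=" or "|" inside a value would be indistinguishable from that markup. A small sketch of the round trip, using only the MorphAnalysis constructor already used in the diff (the exact feature order in the string is an implementation detail, so the checks stay loose):

from spacy.tokens import MorphAnalysis
from spacy.vocab import Vocab

vocab = Vocab()
morph = MorphAnalysis(vocab, {"reading": "タ", "inflection": "助動詞-タ;終止形-一般"})

# The serialized form is "Feature=Value" pairs joined by "|",
# e.g. "inflection=助動詞-タ;終止形-一般|reading=タ".
assert "reading=タ" in str(morph)
assert "|" in str(morph)

# A literal "=" or "|" inside a reading (as for the token "=") would collide
# with that markup, which is why the tokenizer now rewrites them to "_".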