Fix JA Morph Values (#9449)

* Don't set empty / weird values in morph

* Update tests to handle empty morph values

* Fix everything

* Replace potentially problematic characters

* Fix test
Paul O'Leary McCann 2021-10-14 07:21:36 +00:00 committed by GitHub
parent ae1b3e960b
commit a3b7519aba
2 changed files with 16 additions and 6 deletions
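Why these characters are a problem: spaCy renders `token.morph` as a UD-FEATS-style string in which `|` separates features and `=` separates a feature name from its value, so a raw SudachiPy reading like `=` (punctuation is its own reading) would corrupt that string. A minimal sketch of the replacement this commit applies; the helper name is made up for illustration:

```python
import re

def sanitize_reading(reading: str) -> str:
    # map the FEATS delimiter characters to "_" so they can't
    # break the serialized morph string
    return re.sub("[=|]", "_", reading)

print(sanitize_reading("="))    # -> "_"
print(sanitize_reading("サン"))  # unchanged -> "サン"
```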

spacy/lang/ja/__init__.py

@@ -3,6 +3,7 @@ from pathlib import Path
 import srsly
 from collections import namedtuple
 from thinc.api import Model
+import re
 
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -77,10 +78,14 @@ class JapaneseTokenizer(DummyTokenizer):
             # if there's no lemma info (it's an unk) just use the surface
             token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
             morph = {}
-            morph["inflection"] = dtoken.inf
+            if dtoken.inf:
+                # it's normal for this to be empty for non-inflecting types
+                morph["inflection"] = dtoken.inf
             token.norm_ = dtoken.norm
             if dtoken.reading:
-                morph["reading"] = dtoken.reading
+                # punctuation is its own reading, but we don't want values like
+                # "=" here
+                morph["reading"] = re.sub("[=|]", "_", dtoken.reading)
             token.morph = MorphAnalysis(self.vocab, morph)
         if self.need_subtokens:
             doc.user_data["sub_tokens"] = sub_tokens_list
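For context on what the `morph` dict built above turns into: a small sketch of `MorphAnalysis` behavior with illustrative feature values. The fact that `get()` returns a list (empty for an unset field) is what the test changes below rely on.

```python
from spacy.vocab import Vocab
from spacy.tokens import MorphAnalysis

vocab = Vocab()
# illustrative values, mirroring the fields the tokenizer sets
morph = MorphAnalysis(vocab, {"reading": "ツケ", "inflection": "下一段-カ行;連用形-一般"})
print(str(morph))            # FEATS-style string, e.g. "inflection=...|reading=ツケ"
print(morph.get("reading"))  # ["ツケ"] -- get() returns a list of values
print(MorphAnalysis(vocab, {}).get("reading"))  # [] when the field was never set
```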

spacy/tests/lang/ja/test_tokenizer.py

@@ -130,8 +130,13 @@ def test_ja_tokenizer_sub_tokens(
     [
         (
             "取ってつけた",
-            ("五段-ラ行;連用形-促音便", "", "下一段-カ行;連用形-一般", "助動詞-タ;終止形-一般"),
-            ("トッ", "", "ツケ", ""),
+            (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
+            (["トッ"], [""], ["ツケ"], [""]),
+        ),
+        (
+            "2=3",
+            ([], [], []),
+            ([""], ["_"], ["サン"])
         ),
     ],
 )
@@ -139,9 +144,9 @@ def test_ja_tokenizer_inflections_reading_forms(
     ja_tokenizer, text, inflections, reading_forms
 ):
     tokens = ja_tokenizer(text)
-    test_inflections = [tt.morph.get("inflection")[0] for tt in tokens]
+    test_inflections = [tt.morph.get("inflection") for tt in tokens]
     assert test_inflections == list(inflections)
-    test_readings = [tt.morph.get("reading")[0] for tt in tokens]
+    test_readings = [tt.morph.get("reading") for tt in tokens]
     assert test_readings == list(reading_forms)
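A rough end-to-end sketch of what the new `2=3` case exercises, assuming SudachiPy and `sudachidict_core` are installed (they back spaCy's `ja` pipeline). Note the old `get(...)[0]` indexing would raise `IndexError` on tokens with no inflection, which is why the test now compares the lists directly.

```python
import spacy

nlp = spacy.blank("ja")  # requires sudachipy + sudachidict_core
doc = nlp("2=3")
print([t.morph.get("reading") for t in doc])
# per the test above: [[""], ["_"], ["サン"]] -- "=" is sanitized to "_"
print([t.morph.get("inflection") for t in doc])
# per the test above: [[], [], []] -- none of these tokens inflect
```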