Fix JA Morph Values (#9449)
* Don't set empty / weird values in morph
* Update tests to handle empty morph values
* Fix everything
* Replace potentially problematic characters
* Fix test
This commit is contained in:
parent ae1b3e960b
commit a3b7519aba
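Background for the "problematic characters" item (not from the commit itself): spaCy renders morph features in UD FEATS style, where "=" separates a feature from its value and "|" separates features, so a punctuation token whose reading is literally "=" or "|" would yield an ambiguous feature string. A rough sketch of the substitution the commit introduces; the sanitize_reading helper is hypothetical, only the re.sub call comes from the diff:

import re

def sanitize_reading(reading: str) -> str:
    # swap out characters that clash with the "feature=value|feature=value"
    # notation spaCy uses when it renders morph features
    return re.sub("[=|]", "_", reading)

print(sanitize_reading("="))    # "_"  (the reading of a literal "=" token)
print(sanitize_reading("トッ"))  # unchanged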
@@ -3,6 +3,7 @@ from pathlib import Path
 import srsly
 from collections import namedtuple
 from thinc.api import Model
+import re

 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -77,10 +78,14 @@ class JapaneseTokenizer(DummyTokenizer):
             # if there's no lemma info (it's an unk) just use the surface
             token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
             morph = {}
-            morph["inflection"] = dtoken.inf
+            if dtoken.inf:
+                # it's normal for this to be empty for non-inflecting types
+                morph["inflection"] = dtoken.inf
             token.norm_ = dtoken.norm
-            morph["reading"] = dtoken.reading
+            if dtoken.reading:
+                # punctuation is its own reading, but we don't want values like
+                # "=" here
+                morph["reading"] = re.sub("[=|]", "_", dtoken.reading)
             token.morph = MorphAnalysis(self.vocab, morph)
         if self.need_subtokens:
             doc.user_data["sub_tokens"] = sub_tokens_list
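To see the effect end to end, a rough usage sketch (assuming a Japanese pipeline, which requires the sudachipy and sudachidict_core packages; the example string is taken from the new test case below):

import spacy

nlp = spacy.blank("ja")
doc = nlp("2=3")
for token in doc:
    # readings no longer contain "=" or "|" (the "=" token's reading becomes "_"),
    # and tokens without inflection info simply have no "inflection" feature,
    # so morph.get returns an empty list for them
    print(token.text, token.morph.get("reading"), token.morph.get("inflection"))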
@@ -130,8 +130,13 @@ def test_ja_tokenizer_sub_tokens(
     [
         (
             "取ってつけた",
-            ("五段-ラ行;連用形-促音便", "", "下一段-カ行;連用形-一般", "助動詞-タ;終止形-一般"),
-            ("トッ", "テ", "ツケ", "タ"),
+            (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
+            (["トッ"], ["テ"], ["ツケ"], ["タ"]),
         ),
+        (
+            "2=3",
+            ([], [], []),
+            (["ニ"], ["_"], ["サン"])
+        ),
     ],
 )
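The expected inflections and readings are now written as lists (and empty lists where a token has no such feature) because MorphAnalysis.get returns a list of values. A minimal standalone sketch, not part of the test file:

from spacy.tokens import MorphAnalysis
from spacy.vocab import Vocab

vocab = Vocab()
# feature present: get() returns a list of values
assert MorphAnalysis(vocab, {"reading": "ニ"}).get("reading") == ["ニ"]
# feature absent (e.g. no inflection on the "=" token): get() returns []
assert MorphAnalysis(vocab, {}).get("inflection") == []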
@@ -139,9 +144,9 @@ def test_ja_tokenizer_inflections_reading_forms(
     ja_tokenizer, text, inflections, reading_forms
 ):
     tokens = ja_tokenizer(text)
-    test_inflections = [tt.morph.get("inflection")[0] for tt in tokens]
+    test_inflections = [tt.morph.get("inflection") for tt in tokens]
     assert test_inflections == list(inflections)
-    test_readings = [tt.morph.get("reading")[0] for tt in tokens]
+    test_readings = [tt.morph.get("reading") for tt in tokens]
     assert test_readings == list(reading_forms)
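Dropping the [0] indexing follows from the same change: for a token with no inflection, morph.get("inflection") is now an empty list, so taking the first element would raise an IndexError before any assertion runs. A tiny sketch of the failure mode, with made-up values:

inflections = [["五段-ラ行;連用形-促音便"], []]  # second token has no inflection
try:
    [values[0] for values in inflections]       # what the old-style indexing did
except IndexError:
    print("empty morph values break [0] indexing")
# comparing whole lists, as the test does now, handles empty values cleanly
assert inflections == [["五段-ラ行;連用形-促音便"], []]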