Fix JA Morph Values (#9449)
* Don't set empty / weird values in morph
* Update tests to handle empty morph values
* Fix everything
* Replace potentially problematic characters
* Fix test
This commit is contained in:
parent ae1b3e960b
commit a3b7519aba
@@ -3,6 +3,7 @@ from pathlib import Path
 import srsly
 from collections import namedtuple
 from thinc.api import Model
+import re

 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -77,10 +78,14 @@ class JapaneseTokenizer(DummyTokenizer):
             # if there's no lemma info (it's an unk) just use the surface
             token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface
             morph = {}
-            morph["inflection"] = dtoken.inf
+            if dtoken.inf:
+                # it's normal for this to be empty for non-inflecting types
+                morph["inflection"] = dtoken.inf
             token.norm_ = dtoken.norm
             if dtoken.reading:
-                morph["reading"] = dtoken.reading
+                # punctuation is its own reading, but we don't want values like
+                # "=" here
+                morph["reading"] = re.sub("[=|]", "_", dtoken.reading)
             token.morph = MorphAnalysis(self.vocab, morph)
         if self.need_subtokens:
             doc.user_data["sub_tokens"] = sub_tokens_list
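The reading for a punctuation token is the punctuation character itself, so before this fix a token like "=" would carry "=" as its reading inside the morph, and "=" / "|" are the separator characters in the Key=Value|Key=Value string spaCy uses to render a MorphAnalysis. Below is a minimal sketch of the sanitization step in isolation; the DetailedToken namedtuple is a hypothetical stand-in for the tokenizer's real per-token record, not the actual object:

import re
from collections import namedtuple

# Hypothetical stand-in for the tokenizer's per-token info, for illustration only
DetailedToken = namedtuple("DetailedToken", ["surface", "inf", "reading"])

def build_morph_features(dtoken):
    """Collect morph features, skipping empty values and escaping reserved characters."""
    morph = {}
    if dtoken.inf:
        # it's normal for this to be empty for non-inflecting types
        morph["inflection"] = dtoken.inf
    if dtoken.reading:
        # "=" and "|" would collide with the Key=Value|Key=Value morph string,
        # so they are replaced with "_"
        morph["reading"] = re.sub("[=|]", "_", dtoken.reading)
    return morph

print(build_morph_features(DetailedToken("=", "", "=")))
# {'reading': '_'}
print(build_morph_features(DetailedToken("取っ", "五段-ラ行;連用形-促音便", "トッ")))
# {'inflection': '五段-ラ行;連用形-促音便', 'reading': 'トッ'}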
@@ -130,8 +130,13 @@ def test_ja_tokenizer_sub_tokens(
     [
         (
             "取ってつけた",
-            ("五段-ラ行;連用形-促音便", "", "下一段-カ行;連用形-一般", "助動詞-タ;終止形-一般"),
-            ("トッ", "テ", "ツケ", "タ"),
+            (["五段-ラ行;連用形-促音便"], [], ["下一段-カ行;連用形-一般"], ["助動詞-タ;終止形-一般"]),
+            (["トッ"], ["テ"], ["ツケ"], ["タ"]),
+        ),
+        (
+            "2=3",
+            ([], [], []),
+            (["ニ"], ["_"], ["サン"])
         ),
     ],
 )
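The expected values in the parametrized test are now lists because Token.morph.get returns a list of values per feature (empty when the feature was never set), and the new "2=3" case checks that the "=" token gets "_" as its reading while carrying no inflection. A quick way to reproduce the expectation, assuming spaCy plus SudachiPy and the Japanese language data are installed:

import spacy

# spacy.blank("ja") builds the SudachiPy-backed Japanese tokenizer
nlp = spacy.blank("ja")
doc = nlp("2=3")
print([t.morph.get("reading") for t in doc])     # per the test above: [['ニ'], ['_'], ['サン']]
print([t.morph.get("inflection") for t in doc])  # per the test above: [[], [], []]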
@@ -139,9 +144,9 @@ def test_ja_tokenizer_inflections_reading_forms(
     ja_tokenizer, text, inflections, reading_forms
 ):
     tokens = ja_tokenizer(text)
-    test_inflections = [tt.morph.get("inflection")[0] for tt in tokens]
+    test_inflections = [tt.morph.get("inflection") for tt in tokens]
     assert test_inflections == list(inflections)
-    test_readings = [tt.morph.get("reading")[0] for tt in tokens]
+    test_readings = [tt.morph.get("reading") for tt in tokens]
     assert test_readings == list(reading_forms)
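Indexing [0] into the result of morph.get only worked while every token had some value stored for the feature; once empty morphs are allowed, get returns an empty list and the indexing would raise IndexError, so the tests now compare the returned lists directly. A small illustration of that behaviour, assuming MorphAnalysis is importable from spacy.tokens as in current spaCy releases:

from spacy.tokens import MorphAnalysis
from spacy.vocab import Vocab

vocab = Vocab()
empty = MorphAnalysis(vocab, {})   # a token with no morph features set
print(empty.get("inflection"))     # [] -- safe to compare against an empty list
# empty.get("inflection")[0] would raise IndexError, which is what the old
# test code effectively did for tokens without an inflection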