From 227f98081b86b315907ec672f02b0a2334dd10e8 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 7 Oct 2021 17:14:05 +0900 Subject: [PATCH] Use a pipe for separating Japanese inflections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Individual inflection values already contain hyphens, so joined with a pipe they look like this: 五段-ラ行|連用形-促音便 Joining them with a hyphen instead erases the boundaries between the original fields. --- spacy/lang/ja/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 0f25b1fc1..0695415be 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -94,7 +94,7 @@ class JapaneseTokenizer(DummyTokenizer): DetailedToken( token.surface(), # orth "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag - "-".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf + "|".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf token.dictionary_form(), # lemma token.normalized_form(), token.reading_form(),