Fix half-width space handling in JA (#4284) (closes #4262)

Before this patch, half-width spaces between words were simply lost in
Japanese text. This wasn't immediately noticeable because much Japanese
text never uses spaces at all.
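
The effect is easiest to see on mixed text. A rough illustration of the intended behaviour (a hypothetical snippet, not part of this commit; assumes spaCy with mecab-python3 and a UniDic-style dictionary installed):

# Hypothetical usage sketch, not part of this commit.
import spacy

nlp = spacy.blank("ja")
doc = nlp("I like cheese.")  # contains half-width (ASCII) spaces

# Before this patch the spaces were dropped, so doc.text could not
# reproduce the input; with it, each token records its trailing space.
print([(t.text, t.whitespace_) for t in doc])
print(doc.text)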
Paul O'Leary McCann 2019-09-13 23:28:12 +09:00 committed by Ines Montani
parent 3c3658ef9f
commit 29a9e636eb
3 changed files with 29 additions and 4 deletions

spacy/lang/ja/__init__.py

@@ -37,6 +37,11 @@ def resolve_pos(token):
     in the sentence. This function adds information to the POS tag to
     resolve ambiguous mappings.
     """
+
+    # this is only used for consecutive ascii spaces
+    if token.pos == '空白':
+        return '空白'
+
     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
     # PoS mappings.
@@ -54,6 +59,7 @@ def detailed_tokens(tokenizer, text):
     node = tokenizer.parseToNode(text)
     node = node.next # first node is beginning of sentence and empty, skip it
     words = []
+    spaces = []
     while node.posid != 0:
         surface = node.surface
         base = surface # a default value. Updated if available later.
@@ -64,8 +70,20 @@ def detailed_tokens(tokenizer, text):
             # dictionary
             base = parts[7]
         words.append(ShortUnitWord(surface, base, pos))
+
+        # The way MeCab stores spaces is that the rlength of the next token is
+        # the length of that token plus any preceding whitespace, **in bytes**.
+        # also note that this is only for half-width / ascii spaces. Full width
+        # spaces just become tokens.
+        scount = node.next.rlength - node.next.length
+        spaces.append(bool(scount))
+        while scount > 1:
+            words.append(ShortUnitWord(' ', ' ', '空白'))
+            spaces.append(False)
+            scount -= 1
         node = node.next
-    return words
+    return words, spaces


 class JapaneseTokenizer(DummyTokenizer):
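
A standalone sketch of the rlength trick used in detailed_tokens above (illustrative only, not part of this commit; assumes mecab-python3 with a suitable dictionary installed):

import MeCab

tagger = MeCab.Tagger()
tagger.parseToNode("")  # work around the SWIG binding issue noted above (see #2901)

node = tagger.parseToNode("I   like cheese.").next  # skip the empty BOS node
while node.posid != 0:
    # rlength covers the token plus any preceding whitespace, in bytes,
    # so the difference is the number of ASCII spaces before this token.
    print(repr(node.surface), node.rlength - node.length)
    node = node.next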
@@ -75,9 +93,8 @@ class JapaneseTokenizer(DummyTokenizer):
         self.tokenizer.parseToNode("") # see #2901

     def __call__(self, text):
-        dtokens = detailed_tokens(self.tokenizer, text)
+        dtokens, spaces = detailed_tokens(self.tokenizer, text)
         words = [x.surface for x in dtokens]
-        spaces = [False] * len(words)
         doc = Doc(self.vocab, words=words, spaces=spaces)
         mecab_tags = []
         for token, dtoken in zip(doc, dtokens):

spacy/lang/ja/tag_map.py

@@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
-from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET
+from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE


 TAG_MAP = {
@@ -21,6 +21,8 @@ TAG_MAP = {
     "感動詞,一般,*,*": {POS: INTJ},
     # this is specifically for unicode full-width space
     "空白,*,*,*": {POS: X},
+    # This is used when sequential half-width spaces are present
+    "空白": {POS: SPACE},
     "形状詞,一般,*,*": {POS: ADJ},
     "形状詞,タリ,*,*": {POS: ADJ},
     "形状詞,助動詞語幹,*,*": {POS: ADJ},

spacy/tests/lang/ja/test_tokenizer.py

@@ -47,3 +47,9 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
+
+def test_extra_spaces(ja_tokenizer):
+    # note: three spaces after "I"
+    tokens = ja_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == ' '
+    assert tokens[2].orth_ == ' '
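
A possible follow-up check (hypothetical, not part of this commit) is that the stored spaces let the document text round-trip the original input:

def test_extra_spaces_roundtrip(ja_tokenizer):
    # hypothetical extra test, not in this commit
    text = "I   like cheese."
    tokens = ja_tokenizer(text)
    assert tokens.text == text
    assert tokens[1].pos_ == "SPACE"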