Fix half-width space handling in JA (#4284) (closes #4262)

Before this patch, half-width spaces between words were simply lost in
Japanese text. This wasn't immediately noticeable because most Japanese
text doesn't use spaces at all.
Paul O'Leary McCann 2019-09-13 23:28:12 +09:00 committed by Ines Montani
parent 3c3658ef9f
commit 29a9e636eb
3 changed files with 29 additions and 4 deletions
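
As a minimal sketch of the symptom (not part of this patch): spaCy's Doc reconstructs text from per-token trailing-space flags, so building the Doc with spaces=[False] * len(words), as the old __call__ below does, drops any whitespace that was in the input.

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    words = ["I", "like", "cheese", "."]

    # Old behaviour: every token is flagged as having no trailing space,
    # so the original half-width spaces cannot be recovered from the Doc.
    doc = Doc(Vocab(), words=words, spaces=[False] * len(words))
    print(doc.text)  # "Ilikecheese."

    # With real per-token space flags, the text round-trips.
    doc = Doc(Vocab(), words=words, spaces=[True, True, False, False])
    print(doc.text)  # "I like cheese."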


@@ -37,6 +37,11 @@ def resolve_pos(token):
     in the sentence. This function adds information to the POS tag to
     resolve ambiguous mappings.
     """
+    # this is only used for consecutive ascii spaces
+    if token.pos == '空白':
+        return '空白'
     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
     # PoS mappings.
@@ -54,6 +59,7 @@ def detailed_tokens(tokenizer, text):
     node = tokenizer.parseToNode(text)
     node = node.next  # first node is beginning of sentence and empty, skip it
     words = []
+    spaces = []
     while node.posid != 0:
         surface = node.surface
         base = surface  # a default value. Updated if available later.
@@ -64,8 +70,20 @@ def detailed_tokens(tokenizer, text):
             # dictionary
             base = parts[7]
         words.append(ShortUnitWord(surface, base, pos))
+        # The way MeCab stores spaces is that the rlength of the next token is
+        # the length of that token plus any preceding whitespace, **in bytes**.
+        # also note that this is only for half-width / ascii spaces. Full width
+        # spaces just become tokens.
+        scount = node.next.rlength - node.next.length
+        spaces.append(bool(scount))
+        while scount > 1:
+            words.append(ShortUnitWord(' ', ' ', '空白'))
+            spaces.append(False)
+            scount -= 1
         node = node.next
-    return words
+    return words, spaces
 class JapaneseTokenizer(DummyTokenizer):
@@ -75,9 +93,8 @@ class JapaneseTokenizer(DummyTokenizer):
         self.tokenizer.parseToNode("")  # see #2901
     def __call__(self, text):
-        dtokens = detailed_tokens(self.tokenizer, text)
+        dtokens, spaces = detailed_tokens(self.tokenizer, text)
         words = [x.surface for x in dtokens]
-        spaces = [False] * len(words)
         doc = Doc(self.vocab, words=words, spaces=spaces)
         mecab_tags = []
         for token, dtoken in zip(doc, dtokens):
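
To make the byte arithmetic in detailed_tokens concrete, here is a small standalone sketch. It is not part of the patch; it assumes the mecab-python3 bindings and a MeCab dictionary are installed, mirrors the loop above, and prints how many ASCII spaces precede each surface form using the same rlength - length difference.

    import MeCab  # assumption: mecab-python3 and a MeCab dictionary are available

    tagger = MeCab.Tagger()
    tagger.parseToNode("")  # warm-up call, mirroring the #2901 workaround above

    node = tagger.parseToNode("I   like cheese.")  # three spaces after "I"
    node = node.next  # skip the empty beginning-of-sentence node
    while node.posid != 0:  # same end-of-sentence check as detailed_tokens
        # rlength covers this token plus any preceding half-width whitespace,
        # in bytes, so the difference is the number of ASCII spaces skipped.
        n_spaces = node.rlength - node.length
        print(repr(node.surface), n_spaces)
        node = node.next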


@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
-from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET
+from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE
 TAG_MAP = {
@@ -21,6 +21,8 @@ TAG_MAP = {
     "感動詞,一般,*,*": {POS: INTJ},
     # this is specifically for unicode full-width space
     "空白,*,*,*": {POS: X},
+    # This is used when sequential half-width spaces are present
+    "空白": {POS: SPACE},
     "形状詞,一般,*,*": {POS: ADJ},
     "形状詞,タリ,*,*": {POS: ADJ},
     "形状詞,助動詞語幹,*,*": {POS: ADJ},


@@ -47,3 +47,9 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
+
+def test_extra_spaces(ja_tokenizer):
+    # note: three spaces after "I"
+    tokens = ja_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == ' '
+    assert tokens[2].orth_ == ' '
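
Beyond the orth_ assertions above, a hedged end-to-end check (assuming a spaCy build that contains this patch and has mecab-python3 installed) would be that the Doc now round-trips the input whitespace and the inserted space tokens pick up the new 空白 mapping:

    from spacy.lang.ja import Japanese  # instantiating this requires mecab-python3

    nlp = Japanese()
    doc = nlp("I   like cheese.")  # three spaces after "I", as in the test

    print([t.orth_ for t in doc])  # expected: ['I', ' ', ' ', 'like', 'cheese', '.']
    print(doc.text)                # expected to round-trip as "I   like cheese."
    print(doc[1].pos_)             # expected "SPACE" via the new 空白 tag-map entry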