From 29a9e636eb8b0f22f91eee85b4a10b8cdade4ed2 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Fri, 13 Sep 2019 23:28:12 +0900
Subject: [PATCH] Fix half-width space handling in JA (#4284) (closes #4262)

Before this patch, half-width spaces between words were simply lost in
Japanese text. This wasn't immediately noticeable because much Japanese
text never uses spaces at all.
---
 spacy/lang/ja/__init__.py             | 23 ++++++++++++++++++++---
 spacy/lang/ja/tag_map.py              |  4 +++-
 spacy/tests/lang/ja/test_tokenizer.py |  6 ++++++
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 3a6074bba..791b1ec33 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -37,6 +37,11 @@ def resolve_pos(token):
     in the sentence. This function adds information to the POS tag to
     resolve ambiguous mappings.
     """
+
+    # This is only used for consecutive ASCII spaces
+    if token.pos == '空白':
+        return '空白'
+
     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
     # PoS mappings.
@@ -54,6 +59,7 @@ def detailed_tokens(tokenizer, text):
     node = tokenizer.parseToNode(text)
     node = node.next  # first node is beginning of sentence and empty, skip it
     words = []
+    spaces = []
     while node.posid != 0:
         surface = node.surface
         base = surface  # a default value. Updated if available later.
@@ -64,8 +70,20 @@
             # dictionary
             base = parts[7]
         words.append(ShortUnitWord(surface, base, pos))
+
+        # The way MeCab stores spaces is that the rlength of the next token is
+        # the length of that token plus any preceding whitespace, **in bytes**.
+        # Also note that this only applies to half-width / ASCII spaces;
+        # full-width spaces become tokens of their own.
+        scount = node.next.rlength - node.next.length
+        spaces.append(bool(scount))
+        while scount > 1:
+            words.append(ShortUnitWord(' ', ' ', '空白'))
+            spaces.append(False)
+            scount -= 1
+
         node = node.next
-    return words
+    return words, spaces
 
 
 class JapaneseTokenizer(DummyTokenizer):
@@ -75,9 +93,8 @@ class JapaneseTokenizer(DummyTokenizer):
         self.tokenizer.parseToNode("")  # see #2901
 
     def __call__(self, text):
-        dtokens = detailed_tokens(self.tokenizer, text)
+        dtokens, spaces = detailed_tokens(self.tokenizer, text)
         words = [x.surface for x in dtokens]
-        spaces = [False] * len(words)
         doc = Doc(self.vocab, words=words, spaces=spaces)
         mecab_tags = []
         for token, dtoken in zip(doc, dtokens):
diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py
index 6b114eb10..4ff0a35ee 100644
--- a/spacy/lang/ja/tag_map.py
+++ b/spacy/lang/ja/tag_map.py
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
-from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET
+from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE
 
 
 TAG_MAP = {
@@ -21,6 +21,8 @@
     "感動詞,一般,*,*": {POS: INTJ},
     # this is specifically for unicode full-width space
     "空白,*,*,*": {POS: X},
+    # This is used when sequential half-width spaces are present
+    "空白": {POS: SPACE},
     "形状詞,一般,*,*": {POS: ADJ},
     "形状詞,タリ,*,*": {POS: ADJ},
     "形状詞,助動詞語幹,*,*": {POS: ADJ},
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py
index c95e7bc40..38ca37bc9 100644
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -47,3 +47,9 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
+
+def test_extra_spaces(ja_tokenizer):
+    # note: three spaces after "I"
+    tokens = ja_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == ' '
+    assert tokens[2].orth_ == ' '
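For context (not part of the patch): a minimal sketch of the behaviour this change is meant to
produce, assuming a spaCy checkout with the patch applied and the Japanese dependencies
(mecab-python3 plus a dictionary) installed so the Japanese language class can load.

    from spacy.lang.ja import Japanese

    nlp = Japanese()
    # Three half-width spaces after "I", matching the new test above.
    doc = nlp("I   like cheese.")

    # The first space is kept as trailing whitespace on "I"; each additional
    # half-width space becomes its own token tagged 空白 (mapped to SPACE),
    # so the input round-trips instead of the spaces being dropped.
    assert doc[1].orth_ == " "
    assert doc[2].orth_ == " "
    assert doc.text == "I   like cheese."

Before the patch, spaces was hard-coded to [False] * len(words), so the extra whitespace
disappeared and doc.text could not reproduce the input; with the spaces list returned by
detailed_tokens, the Doc reconstructs the original string exactly.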