Before this patch, half-width spaces between words were simply lost in Japanese text. This wasn't immediately noticeable because much Japanese text never uses spaces at all.
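As a quick illustration of the intended behavior after this change (a minimal sketch based on the test added below; it assumes mecab-python3 and the Japanese language data are installed):

    from spacy.lang.ja import Japanese

    nlp = Japanese()
    doc = nlp("I   like cheese.")            # three half-width spaces after "I"
    print([t.orth_ for t in doc])            # the extra spaces now survive as tokens
    print([t.whitespace_ for t in doc])      # a single trailing space is stored here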
This commit is contained in:
parent 3c3658ef9f
commit 29a9e636eb
@@ -37,6 +37,11 @@ def resolve_pos(token):
     in the sentence. This function adds information to the POS tag to
     resolve ambiguous mappings.
     """
+
+    # this is only used for consecutive ascii spaces
+    if token.pos == '空白':
+        return '空白'
+
     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
     # PoS mappings.

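For context, resolve_pos() normally rewrites ambiguous MeCab POS strings before the tag map lookup; the new branch simply passes the synthetic space tag through unchanged. Roughly (a sketch; the ShortUnitWord field layout is inferred from this diff):

    # the extra-space tokens emitted by detailed_tokens() below carry the pos '空白'
    word = ShortUnitWord(' ', ' ', '空白')
    tag = resolve_pos(word)        # short-circuits and returns '空白' as-is
    TAG_MAP[tag]                   # -> {POS: SPACE}, so token.pos_ becomes "SPACE"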
@@ -54,6 +59,7 @@ def detailed_tokens(tokenizer, text):
     node = tokenizer.parseToNode(text)
     node = node.next  # first node is beginning of sentence and empty, skip it
     words = []
+    spaces = []
     while node.posid != 0:
         surface = node.surface
         base = surface  # a default value. Updated if available later.

@@ -64,8 +70,20 @@ def detailed_tokens(tokenizer, text):
             # dictionary
             base = parts[7]
         words.append(ShortUnitWord(surface, base, pos))
+
+        # The way MeCab stores spaces is that the rlength of the next token is
+        # the length of that token plus any preceding whitespace, **in bytes**.
+        # also note that this is only for half-width / ascii spaces. Full width
+        # spaces just become tokens.
+        scount = node.next.rlength - node.next.length
+        spaces.append(bool(scount))
+        while scount > 1:
+            words.append(ShortUnitWord(' ', ' ', '空白'))
+            spaces.append(False)
+            scount -= 1
+
         node = node.next
-    return words
+    return words, spaces


 class JapaneseTokenizer(DummyTokenizer):

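The rlength/length bookkeeping can be checked directly against MeCab, independently of spaCy (a standalone sketch, assuming mecab-python3 with a default dictionary):

    import MeCab

    tagger = MeCab.Tagger()
    tagger.parseToNode("")      # see #2901: prime the tagger before reading surfaces
    node = tagger.parseToNode("I   like cheese.").next   # skip the empty BOS node
    while node.posid != 0:
        # rlength covers the surface plus any preceding half-width spaces, in bytes,
        # so the difference is the number of spaces swallowed before this token
        print(repr(node.surface), node.rlength - node.length)
        node = node.next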
@@ -75,9 +93,8 @@ class JapaneseTokenizer(DummyTokenizer):
         self.tokenizer.parseToNode("")  # see #2901

     def __call__(self, text):
-        dtokens = detailed_tokens(self.tokenizer, text)
+        dtokens, spaces = detailed_tokens(self.tokenizer, text)
         words = [x.surface for x in dtokens]
-        spaces = [False] * len(words)
         doc = Doc(self.vocab, words=words, spaces=spaces)
         mecab_tags = []
         for token, dtoken in zip(doc, dtokens):

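For the sentence used in the new test, the two parallel lists returned by detailed_tokens() would look roughly like this, which is exactly the shape Doc expects (a hedged sketch):

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    words = ["I", " ", " ", "like", "cheese", "."]
    spaces = [True, False, False, True, False, False]
    doc = Doc(Vocab(), words=words, spaces=spaces)
    print(doc.text)    # "I   like cheese." -- spaces[i] marks a single trailing space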
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
-from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET
+from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE


 TAG_MAP = {

@@ -21,6 +21,8 @@ TAG_MAP = {
     "感動詞,一般,*,*": {POS: INTJ},
     # this is specifically for unicode full-width space
     "空白,*,*,*": {POS: X},
+    # This is used when sequential half-width spaces are present
+    "空白": {POS: SPACE},
     "形状詞,一般,*,*": {POS: ADJ},
     "形状詞,タリ,*,*": {POS: ADJ},
     "形状詞,助動詞語幹,*,*": {POS: ADJ},

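Note the split between the two whitespace entries: a literal full-width space (U+3000) is a regular MeCab token tagged 空白,*,*,* and keeps POS X, while the bare 空白 key only matches the synthetic tokens that detailed_tokens() emits for runs of half-width spaces and maps them to SPACE. Roughly (a hedged sketch, same assumptions as the snippet above):

    print([(t.orth_, t.pos_) for t in nlp("私は　行く")])   # full-width space token -> "X"
    print([(t.orth_, t.pos_) for t in nlp("a  b")])         # extra ascii space token -> "SPACE"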
@@ -47,3 +47,9 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
+
+def test_extra_spaces(ja_tokenizer):
+    # note: three spaces after "I"
+    tokens = ja_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == ' '
+    assert tokens[2].orth_ == ' '